In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import OLSInfluence
import seaborn as sns
from patsy import dmatrices, dmatrix
from sklearn import metrics
import seaborn as sns
from scipy import stats

In [28]:
def my_lin_reg(model_formula, df, printMSE = False, seed = None, train_frac = 0.8):
    """
    Fit an OLS model on a random training split of ``df`` and return the fit.

    Each row of ``df`` is assigned independently to the training set with
    probability ``train_frac``; the remaining rows form the validation set.
    The model is fitted on the training rows only.

    Parameters
    ----------
    model_formula : str
        patsy formula describing the model, e.g. ``"mpg ~ year"``.
    df : pandas.DataFrame
        Data containing every variable referenced by ``model_formula``.
    printMSE : bool, default False
        When true, print the mean squared error on the training rows
        (``MSE_Train``) and on the held-out validation rows (``MSE_Test``).
    seed : int or None, default None
        Seed for the train/validation split. With ``None`` the split is drawn
        from NumPy's global random state (the previous behaviour), so it
        differs between calls unless ``np.random.seed`` was set by the caller.
        Pass an integer to get the same split on every call.
    train_frac : float, default 0.8
        Probability with which a row is placed in the training set. Must be
        strictly between 0 and 1.

    Returns
    -------
    The fitted results object produced by ``sm.OLS(...).fit()``. Call
    ``.summary()`` on it to display the regression table.

    Raises
    ------
    ValueError
        If ``train_frac`` is not strictly between 0 and 1.
    """
    # Local import: the notebook's import cell only pulls in dmatrices/dmatrix.
    from patsy import build_design_matrices

    if not 0.0 < train_frac < 1.0:
        raise ValueError("train_frac must be strictly between 0 and 1")

    # Split the data into training and validation sets. A dedicated RandomState
    # is used only when a seed is given, so unseeded calls keep drawing from
    # the global NumPy state exactly as before.
    rng = np.random if seed is None else np.random.RandomState(seed)
    mask = rng.rand(len(df)) < train_frac
    train = df[mask]
    valid = df[~mask]

    # Prepare the training data (dmatrices is from the patsy library)
    y_train, X_train = dmatrices(model_formula, data=train, return_type='dataframe')

    # Train the model
    model = sm.OLS(y_train, X_train)
    result = model.fit()

    if printMSE:
        # Encode the validation rows with the design learned on the training
        # rows. Re-running the formula on `valid` would re-infer categorical
        # levels and stateful transforms from the validation data, which can
        # yield columns that do not line up with the fitted coefficients.
        y_valid, X_valid = build_design_matrices(
            [y_train.design_info, X_train.design_info],
            valid,
            return_type='dataframe',
        )

        # Show MSE for training set
        y_train_pred = result.predict(X_train)
        print(f'MSE_Train: {metrics.mean_squared_error(y_train, y_train_pred)}')

        # Show MSE for validation set
        y_valid_pred = result.predict(X_valid)
        print(f'MSE_Test: {metrics.mean_squared_error(y_valid, y_valid_pred)}\n')

    # Return the fitted model (not its summary)
    return result


In [29]:
# Load the Auto data set from a path relative to the notebook's working directory.
# NOTE(review): horsepower comes back as `object` rather than a numeric dtype (see the
# df.dtypes output below), so the column presumably holds non-numeric placeholders —
# confirm and consider passing `na_values` to read_csv before using it as a predictor.
df = pd.read_csv("../lab2/data/Auto.csv")

In [30]:
# Display the dtype pandas inferred for each column of the loaded frame.
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
year              int64
origin            int64
name             object
dtype: object

In [31]:
# Regress mpg on year as a numeric predictor; print train/validation MSE
# and display the regression summary.
my_lin_reg(
    "mpg ~ year",
    df,
    printMSE=True,
).summary()

MSE_Train: 40.607047920899745
MSE_Test: 39.83340727079053



0,1,2,3
Dep. Variable:,mpg,R-squared:,0.326
Model:,OLS,Adj. R-squared:,0.324
Method:,Least Squares,F-statistic:,154.3
Date:,"Sun, 11 Dec 2022",Prob (F-statistic):,3.6200000000000003e-29
Time:,16:42:05,Log-Likelihood:,-1050.0
No. Observations:,321,AIC:,2104.0
Df Residuals:,319,BIC:,2111.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-67.2068,7.297,-9.210,0.000,-81.563,-52.851
year,1.1931,0.096,12.422,0.000,1.004,1.382

0,1,2,3
Omnibus:,18.927,Durbin-Watson:,0.878
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14.172
Skew:,0.41,Prob(JB):,0.000837
Kurtosis:,2.378,Cond. No.,1550.0


In [32]:
# Same regression, but with the year column cast to a pandas categorical
# so the formula encodes it with one indicator term per year level.
my_lin_reg(
    "mpg ~ year",
    df.astype({"year": "category"}),
    printMSE=True,
).summary()

MSE_Train: 34.84992042582431
MSE_Test: 35.33777151322225



0,1,2,3
Dep. Variable:,mpg,R-squared:,0.435
Model:,OLS,Adj. R-squared:,0.413
Method:,Least Squares,F-statistic:,19.09
Date:,"Sun, 11 Dec 2022",Prob (F-statistic):,1.48e-30
Time:,16:42:06,Log-Likelihood:,-990.28
No. Observations:,310,AIC:,2007.0
Df Residuals:,297,BIC:,2055.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,17.2381,1.316,13.098,0.000,14.648,19.828
year[T.71],4.6619,1.884,2.474,0.014,0.953,8.370
year[T.72],1.8054,1.820,0.992,0.322,-1.777,5.388
year[T.73],-0.5284,1.705,-0.310,0.757,-3.883,2.826
year[T.74],5.5619,1.785,3.115,0.002,2.049,9.075
year[T.75],2.0619,1.884,1.094,0.275,-1.647,5.770
year[T.76],4.5536,1.802,2.527,0.012,1.007,8.100
year[T.77],6.3810,1.861,3.428,0.001,2.718,10.044
year[T.78],6.4393,1.705,3.778,0.000,3.085,9.794

0,1,2,3
Omnibus:,12.699,Durbin-Watson:,1.008
Prob(Omnibus):,0.002,Jarque-Bera (JB):,9.815
Skew:,0.336,Prob(JB):,0.00739
Kurtosis:,2.445,Cond. No.,14.8
