In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
sns.set()

In [None]:
# Load the raw dataset; the relative path is resolved against the working directory.
raw_data = pd.read_csv(filepath_or_buffer="1.04. Real-life example.csv")
raw_data.head(5)

# Preprocessing

In [None]:
raw_data.describe(include='all')

In [None]:
# Work on a copy of the raw frame without the 'Model' column.
data = raw_data.drop(columns=['Model'])
data.describe(include='all')

In [None]:
data.isnull().sum()

In [None]:
# Rule of thumb: if dropping the rows with missing values removes less than 5% of the
# observations, it is acceptable to simply drop all of those rows.


In [None]:
# Ad hoc approach: discard every row that has at least one missing value.
data_no_mv = data.dropna(axis='index', how='any')

In [None]:
data_no_mv.describe(include='all')

In [None]:
sns.distplot(data_no_mv['Price'])

# dealing with outliers

In [None]:
# q is the 99th-percentile Price (a single value); keep only rows strictly below it.
q = data_no_mv['Price'].quantile(q=0.99)
data_1 = data_no_mv.loc[data_no_mv['Price'].lt(q)]
data_1.describe(include='all')

In [None]:
sns.distplot(data_1['Price'])

In [None]:
sns.distplot(data_no_mv['Mileage'])

In [None]:
# 99th-percentile cut on Mileage, taken from data_no_mv (i.e. without the Price filter).
# A later cell recomputes data_2 from data_1, so this version only feeds the next plot.
q = data_no_mv['Mileage'].quantile(q=0.99)
data_2 = data_no_mv.loc[data_no_mv['Mileage'].lt(q)]

In [None]:
sns.distplot(data_2['Mileage'])

In [None]:
# 99th-percentile cut on Mileage, applied on top of the Price-filtered frame data_1.
q = data_1['Mileage'].quantile(q=0.99)
data_2 = data_1.loc[data_1['Mileage'].lt(q)]

In [None]:
sns.distplot(data_2['Mileage'])

In [None]:
sns.distplot(data_no_mv['EngineV'])

In [None]:
# Keep rows with EngineV strictly below 6.5.
# NOTE(review): 6.5 is presumably the largest plausible engine volume — confirm the source of this cutoff.
data_3 = data_2.loc[data_2['EngineV'].lt(6.5)]

In [None]:
sns.distplot(data_3['EngineV'])

In [None]:
sns.distplot(data_no_mv['Year'])

In [None]:
# Low-end cut on Year: q is the 1st percentile, and only rows strictly above it are kept.
q = data_3['Year'].quantile(q=0.01)
data_4 = data_3.loc[data_3['Year'].gt(q)]

In [None]:
sns.distplot(data_4['Year'])

In [None]:
# Renumber the rows 0..n-1 after the filtering steps; drop=True discards the old index labels.
data_cleaned=data_4.reset_index(drop=True)

In [None]:
data_cleaned.describe(include='all')

In [None]:
raw_data.describe(include='all')

# checking OLS assumptions

In [None]:
# One scatter panel per numeric feature against Price; the three panels share the y-axis.
f, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(15, 3))
panels = (
    (ax1, 'Year', 'Price and Year'),
    (ax2, 'EngineV', 'Price and EngineV'),
    (ax3, 'Mileage', 'Price and Mileage'),
)
for axis, feature, title in panels:
    axis.scatter(data_cleaned[feature], data_cleaned['Price'])
    axis.set_title(title)

plt.show()

In [None]:
sns.distplot(data_cleaned['Price'])

In [None]:
# Natural log of Price, stored as a new 'log_price' column on data_cleaned.
log_price = data_cleaned['Price'].pipe(np.log)
data_cleaned['log_price'] = log_price
data_cleaned


# Log transformation

In [None]:
# One scatter panel per numeric feature against log_price; the panels share the y-axis.
# The titles name the quantity actually plotted (log_price, not the raw Price).
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(15, 3))
ax1.scatter(data_cleaned['Year'], data_cleaned['log_price'])
ax1.set_title('Log Price and Year')
ax2.scatter(data_cleaned['EngineV'], data_cleaned['log_price'])
ax2.set_title('Log Price and EngineV')
ax3.scatter(data_cleaned['Mileage'], data_cleaned['log_price'])
ax3.set_title('Log Price and Mileage')

plt.show()

In [None]:
# Price is no longer needed once log_price exists, so drop it.
# errors='ignore' keeps this self-reassigning cell safe to re-run after Price is already gone
# (without it, a second run raises KeyError).
data_cleaned = data_cleaned.drop(columns=['Price'], errors='ignore')

# Multicollinearity

In [None]:
data_cleaned.columns.values

In [None]:
# Variance inflation factor for each of the three numeric predictors.
# NOTE(review): `variables` contains no constant column; VIF is usually computed on a design
# matrix that includes an intercept — confirm before interpreting the magnitudes.
variables = data_cleaned[['Mileage', 'Year', 'EngineV']]
vif = pd.DataFrame({
    "VIF": [variance_inflation_factor(variables.values, i) for i in range(variables.shape[1])],
    "features": variables.columns,
})

In [None]:
vif

In [None]:
# Remove the 'Year' column; the result is the frame used for dummy encoding.
data_no_multicollinearity = data_cleaned.drop(columns=['Year'])

## create dummy variables

In [None]:
 # Dummy-encode the frame; drop_first=True omits the first level of each encoded column.
 data_with_dummies = pd.get_dummies(data=data_no_multicollinearity, drop_first=True)

In [None]:
data_with_dummies.head()

## rearrange a bit

In [None]:
data_with_dummies.columns.values

In [None]:
# Column order for the modelling frame: target first, then the numeric inputs, then the dummies.
cols = [
    'log_price',
    'Mileage',
    'EngineV',
    'Brand_BMW',
    'Brand_Mercedes-Benz',
    'Brand_Mitsubishi',
    'Brand_Renault',
    'Brand_Toyota',
    'Brand_Volkswagen',
    'Body_hatch',
    'Body_other',
    'Body_sedan',
    'Body_vagon',
    'Body_van',
    'Engine Type_Gas',
    'Engine Type_Other',
    'Engine Type_Petrol',
    'Registration_yes',
]

In [None]:
# Select the columns in the order given by `cols`.
data_preprocessed = data_with_dummies.loc[:, cols]
data_preprocessed.head()

In [None]:
# VIF over every column of data_preprocessed, which still includes the target log_price.
variables = data_preprocessed
vif = pd.DataFrame({
    "VIF": [
        variance_inflation_factor(variables.values, col_idx)
        for col_idx in range(variables.shape[1])
    ],
    "Features": variables.columns,
})
vif

In [None]:
# Same VIF table, computed on the predictors only (target log_price removed).
variables = data_preprocessed.drop(columns=['log_price'])
vif = pd.DataFrame({
    "VIF": [
        variance_inflation_factor(variables.values, col_idx)
        for col_idx in range(variables.shape[1])
    ],
    "features": variables.columns,
})
vif

In [None]:
# Build a second dummy-encoded frame that keeps every category level (no first level dropped).
data_with_dummies_new = pd.get_dummies(data_no_multicollinearity, drop_first=False)
data_with_dummies_new.head()

In [None]:
# VIF on the full-dummy frame, again with the target log_price removed.
variables = data_with_dummies_new.drop(columns=['log_price'])
vif = pd.DataFrame({
    "VIF": [
        variance_inflation_factor(variables.values, col_idx)
        for col_idx in range(variables.shape[1])
    ],
    "features": variables.columns,
})
vif

# linear regression model

In [None]:
# Dependent variable (log_price) and the remaining columns as regressors.
targets = data_preprocessed["log_price"]
inputs = data_preprocessed.drop(columns=["log_price"])

In [None]:
# Scaling the data: learn the per-column statistics used for standardisation.
# NOTE(review): this fit sees every row of `inputs`, including rows that later end up in the
# test split, so test-row statistics influence the scaling.
scaler = StandardScaler()
scaler.fit(inputs)

In [None]:
# Apply the fitted scaler to every row of `inputs`.
inputs_scaled=scaler.transform(inputs)

In [None]:
# Train/test split

In [None]:
# Split the raw inputs first (80/20, fixed seed), then re-fit the scaler on the training rows
# only, so the held-out rows do not influence the statistics used for standardisation.
# Both splits are then transformed with that train-only scaler.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    inputs, targets, test_size=0.2, random_state=365
)
scaler.fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

In [None]:
# Ordinary least squares regression fitted on the training split.
reg = LinearRegression()
reg.fit(X=x_train, y=y_train)

In [None]:
# In-sample predictions: the fitted model evaluated on the rows it was trained on.
y_hat=reg.predict(x_train)

In [None]:
# Training targets against fitted values; both axes are fixed to the same 6-13 window.
plt.scatter(x=y_train, y=y_hat)
plt.xlabel("targets(y_train)", fontsize=18)
plt.ylabel("Predictions(y_hat)", fontsize=18)
plt.xlim(left=6, right=13)
plt.ylim(bottom=6, top=13)
plt.show()

In [None]:
# Distribution of the training residuals (actual minus fitted).
residuals = y_train - y_hat
sns.distplot(residuals)
plt.title("Residual PDF", fontsize=18)


In [None]:
reg.score(x_train,y_train)

# finding the weights and bias

In [None]:
reg.intercept_

In [None]:
reg.coef_

In [None]:
# Pair each input column name with its fitted coefficient.
reg_summary = pd.DataFrame({"Features": inputs.columns.values})
reg_summary['Weights'] = reg.coef_
reg_summary

In [None]:
data_cleaned["Brand"].unique()

# testing

In [None]:
# Predictions for the held-out rows produced by the train/test split.
y_hat_test=reg.predict(x_test)

In [None]:
# Test targets against predictions; alpha=0.2 makes overlapping points show up as darker areas.
plt.scatter(x=y_test, y=y_hat_test, alpha=0.2)
plt.xlabel("targets(y_test)", fontsize=18)
plt.ylabel("Predictions(y_hat_test)", fontsize=18)
plt.xlim(left=6, right=13)
plt.ylim(bottom=6, top=13)
plt.show()

In [None]:
# Exponentiate the log-price predictions to get them back on the price scale.
df_pf = pd.DataFrame({'Prediction': np.exp(y_hat_test)})
df_pf.head()

In [None]:
# y_test still carries the row labels it had before the split, while df_pf is indexed 0..n-1.
# Assigning the Series directly would align on those labels and leave NaN/mismatched targets,
# so pass the underlying values and assign by position instead.
df_pf['Target'] = np.exp(y_test.values)
df_pf

In [None]:
# Discard the row labels inherited from the pre-split frame so y_test is indexed 0..n-1.
y_test=y_test.reset_index(drop=True)
y_test.head()

In [None]:
# Actual prices: exponentiate the log-price targets and store them next to the predictions.
df_pf['Target'] = y_test.pipe(np.exp)
df_pf

In [None]:
# Residual on the price scale: actual minus predicted.
df_pf["Residual"]=df_pf['Target']-df_pf['Prediction']

In [None]:
# Absolute residual expressed as a percentage of the actual price.
df_pf["Difference %"] = (df_pf["Residual"] / df_pf["Target"] * 100).abs()

In [None]:
df_pf

In [None]:
df_pf.describe()

In [None]:
# Show up to 999 rows and two decimals, then list the rows from smallest to largest error.
pd.set_option('display.max_rows', 999)
pd.set_option('display.float_format', '{:.2f}'.format)
df_pf.sort_values(by="Difference %")