# Predicting Used Car Price

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
import dmba

In [None]:
# Read 'ToyotaCorolla.csv' (path is relative to the working directory) into a DataFrame.
data = pd.read_csv('ToyotaCorolla.csv')

In [None]:
# Peek at the first two rows to sanity-check the load.
data.head(2)

In [None]:
# Dimensions of the loaded frame as (n_rows, n_columns).
data.shape

In [None]:
# Print a per-column summary of the frame (non-null counts and dtypes).
data.info()

In [None]:
# Histogram of Price over the first 1000 rows, drawn on a 10x5 inch figure.
plt.figure(figsize=(10, 5))
plt.hist(data['Price'].iloc[:1000])

In [None]:
# List the column names available in the loaded frame.
data.columns

#### Fitting a Regression Model

In [None]:
# Keep only the first 1000 rows, then name the model inputs and the target.
df = data.head(1000)
predictors = [
    'Age_08_04',
    'KM',
    'Fuel_Type',
    'HP',
    'Met_Color',
    'Automatic',
    'CC',
    'Doors',
    'Quarterly_Tax',
    'Weight',
]
outcome = 'Price'

In [None]:
# Preview the selected predictor columns (head() shows the first five rows by default).
df[predictors].head()

In [None]:
# Preview the first rows of the outcome column.
df[outcome].head()

#### Partition the data into X and y

In [None]:
# Target vector: the outcome column of the reduced frame.
y = df[outcome]
# Design matrix: dummy-encode the predictor columns, dropping the first level
# of each encoded variable so the dummies are not perfectly collinear.
X = pd.get_dummies(df.loc[:, predictors], drop_first=True)

#### Train and test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

In [None]:
# Fit a linear regression on the training partition (fit() returns the
# estimator itself, so construction and fitting can be chained).
car_lm = LinearRegression().fit(x_train, y_train)

# Show each design-matrix column next to its fitted coefficient.
print(
    pd.DataFrame(
        {
            'Predictor': X.columns,
            'coefficient': car_lm.coef_,
        }
    )
)

In [None]:
# Print performance measures for the training data: actual training outcomes
# compared with the model's predictions on the same rows it was fitted on.
dmba.regressionSummary(y_train,car_lm.predict(x_train))

#### Prediction 

In [None]:
# Predict the outcome for the held-out rows.
y_pred = car_lm.predict(x_test)

# Side-by-side table of predictions and actual values; the frame takes its
# index from y_test, and the residual is actual minus predicted.
result = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test})
result['Residual'] = result['Actual'] - result['Predicted']
print(result.head())

In [None]:
# Print performance measures for the test dataset: actual held-out outcomes
# compared with the predictions stored in y_pred.
dmba.regressionSummary(y_test,y_pred)

#### Plot Residuals

In [None]:
# Residual = actual minus predicted value on the held-out rows.
res = y_test - y_pred

# `res` keeps the Series name of y_test (the outcome column), so wrapping it
# directly in a DataFrame would title the histogram after the outcome rather
# than the residuals. Name the column explicitly so the plot is labelled
# 'Residual'.
pd.DataFrame({'Residual': res}).hist(bins=25)
plt.show()

## ```sklearn.metrics```

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Fitted coefficients of the linear model, in design-matrix column order.
print('Coefficients: \n', car_lm.coef_)

# Mean squared error on the held-out rows, rounded to two decimals.
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))

# R^2 on the held-out rows; a value of 1 means perfect prediction.
print('(R^2)Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

In [None]:
# Root mean squared error on the held-out rows (square root of the MSE), which
# is on the same scale as the outcome. Relies on mean_squared_error having been
# imported in the sklearn.metrics cell.
print(
    'RMSE:',
    np.sqrt(mean_squared_error(y_test, y_pred)),
)

In [None]:
# Adjusted R^2 on the training partition:
#   1 - (1 - R^2) * (n - 1) / (n - p - 1)
# where n is the number of rows and p the number of predictor columns.
n_train = len(y_train)
p_train = x_train.shape[1]
r2_train = car_lm.score(x_train, y_train)
AdjustedR2 = 1 - (1 - r2_train) * (n_train - 1) / (n_train - p_train - 1)
print('Train dataset Adjusted R^2: ', AdjustedR2)

In [None]:
# Adjusted R^2 on the held-out partition:
#   1 - (1 - R^2) * (n - 1) / (n - p - 1)
# where n is the number of rows and p the number of predictor columns.
n_test = len(y_test)
p_test = x_test.shape[1]
r2_test = car_lm.score(x_test, y_test)
AdjustedR2 = 1 - (1 - r2_test) * (n_test - 1) / (n_test - p_test - 1)
print('Test dataset Adjusted R^2: ', AdjustedR2)