**Car Regression**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

df = pd.read_csv('winequalityN.csv')

# Drop Rows with NaN Values inplace
df.dropna(inplace=True)

df.sample

<bound method NDFrame.sample of        type  fixed acidity  volatile acidity  citric acid  residual sugar  \
0     white            7.0             0.270         0.36            20.7   
1     white            6.3             0.300         0.34             1.6   
2     white            8.1             0.280         0.40             6.9   
3     white            7.2             0.230         0.32             8.5   
4     white            7.2             0.230         0.32             8.5   
...     ...            ...               ...          ...             ...   
6491    red            6.8             0.620         0.08             1.9   
6492    red            6.2             0.600         0.08             2.0   
6494    red            6.3             0.510         0.13             2.3   
6495    red            5.9             0.645         0.12             2.0   
6496    red            6.0             0.310         0.47             3.6   

      chlorides  free sulfur dioxide  total

In [2]:
#make new dataframe of the important variables
import re
labels = df['quality'].copy()
#print(labels)
df2 = df.drop(df.iloc[:, -1:],axis = 1)
important_data = df2
print(important_data.iloc[1])

type                    white
fixed acidity             6.3
volatile acidity          0.3
citric acid              0.34
residual sugar            1.6
chlorides               0.049
free sulfur dioxide      14.0
total sulfur dioxide    132.0
density                 0.994
pH                        3.3
sulphates                0.49
alcohol                   9.5
Name: 1, dtype: object


In [3]:
avg = labels.mean()
print(avg)

5.818505338078292


In [4]:
#Split the dataset into chunks train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(important_data, labels, test_size=0.25)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4847, 12)
(1616, 12)
(4847,)
(1616,)


**One hot encode values**

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_features = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']
numeric_transformer = Pipeline([
    ('std_scaler', StandardScaler())
])

categorical_features = ['type']
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

x_train = preprocessor.fit_transform(X_train)
x_test = preprocessor.transform(X_test)

In [6]:
print(x_train.shape)
print(x_test.shape)

(4847, 13)
(1616, 13)


In [7]:
#Oldschool standardize
mean = y_train.mean()
std = y_train.std()

y_train -= mean              # use the mean & std
y_train /= std 
y_test -= mean              # use the mean & std
y_test /= std 

In [7]:
print(y_train.shape)
print(y_test.shape)

(4847,)
(1616,)


**Random Forest**

In [8]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 500, max_features =1.0, random_state = 18).fit(x_train, y_train)

In [9]:
from sklearn.metrics import mean_squared_error
# Predict on test data
prediction = rf.predict(x_test)
# Compute mean squared error
mse = mean_squared_error(prediction, y_test)
# Print results
print('MSE:',mse)
print('MAE:',mse*.5)

MSE: 0.34401480940594065
MAE: 0.17200740470297032


In [10]:
forest_predictions = rf.predict(x_test)
forest_mse = mean_squared_error(y_test, forest_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

0.5865277567225107

In [11]:
print(forest_predictions)

[5.04  5.09  5.688 ... 5.98  5.002 5.58 ]


In [17]:
for i in range (40):
    realval = y_test.iloc[i]
    print("prediction",round((forest_predictions[i])),"real:",realval)

prediction 5 real: 5
prediction 5 real: 6
prediction 6 real: 5
prediction 6 real: 5
prediction 7 real: 8
prediction 6 real: 6
prediction 5 real: 5
prediction 6 real: 6
prediction 6 real: 6
prediction 6 real: 6
prediction 6 real: 6
prediction 7 real: 7
prediction 5 real: 6
prediction 5 real: 5
prediction 5 real: 5
prediction 6 real: 6
prediction 6 real: 6
prediction 5 real: 6
prediction 5 real: 5
prediction 6 real: 6
prediction 5 real: 6
prediction 5 real: 5
prediction 6 real: 8
prediction 5 real: 3
prediction 7 real: 7
prediction 5 real: 5
prediction 7 real: 7
prediction 6 real: 6
prediction 6 real: 6
prediction 6 real: 6
prediction 5 real: 5
prediction 6 real: 6
prediction 6 real: 7
prediction 5 real: 5
prediction 6 real: 5
prediction 6 real: 6
prediction 6 real: 6
prediction 5 real: 5
prediction 6 real: 7
prediction 6 real: 5


In [20]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(x_train,y_train)

linearpreds = lin_reg.predict(x_train)

print("prediction;",linearpreds[10])
print("label;",y_test.iloc[10])

preds = lin_reg.predict(x_test)
lin_mse = mean_squared_error(y_test,preds)
print(np.sqrt(lin_mse))
print(lin_mse)

prediction; 6.1092889878670835
label; 6
0.7162017500132015
0.5129449467219724


In [25]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(x_train,y_train)

preds = tree_reg.predict(x_test)
tree_mse = mean_squared_error(preds,y_test)
print(tree_mse)

0.6695544554455446


In [27]:
from sklearn.model_selection import cross_val_score

cvs = cross_val_score(tree_reg,x_train,y_train,scoring="neg_mean_squared_error",cv=10)

tree_rmse_scores = np.sqrt(-cvs)
def display_scores(scores):
    print("Scores:",scores)
    print("Mean",scores.mean())
    print("STD",scores.std())

display_scores(tree_rmse_scores)

Scores: [0.85795249 0.7732646  0.8686997  0.81985666 0.89211899 0.89672945
 0.83850628 0.82697297 0.89188259 0.90681533]
Mean 0.8572799071589255
STD 0.040350992739978706


In [33]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
    'n_estimators': [20,200],
    'max_features': randint(low=2, high=16),
}


forest_reg2 = RandomForestRegressor(random_state=12)
rnd_search = RandomizedSearchCV(forest_reg2,param_distributions=param_distribs,n_jobs=1,n_iter=10,cv=5
,scoring='neg_mean_squared_error',random_state=12)

rnd_search.fit(x_train,y_train)

ovres = rnd_search.cv_results_
for mean_scores,params in zip(ovres["mean_test_score"],ovres["params"]):
    print(np.sqrt(-mean_scores), params)


0.6327352699761896 {'max_features': 13, 'n_estimators': 200}
0.6281899448969719 {'max_features': 8, 'n_estimators': 200}
0.6442461888365777 {'max_features': 3, 'n_estimators': 20}
0.628123708837502 {'max_features': 5, 'n_estimators': 200}
0.6478165230113501 {'max_features': 14, 'n_estimators': 20}
0.6281899448969719 {'max_features': 8, 'n_estimators': 200}
0.6298285946953031 {'max_features': 6, 'n_estimators': 200}
0.6327352699761896 {'max_features': 15, 'n_estimators': 200}
0.6275333483528395 {'max_features': 4, 'n_estimators': 200}
0.6465739747654384 {'max_features': 8, 'n_estimators': 20}


In [31]:
from sklearn.model_selection import GridSearchCV
import sklearn

param_grid = [ {'n_neighbors': range(1,7)}]

neigh_reg = sklearn.neighbors.KNeighborsRegressor()
grid_search = GridSearchCV(neigh_reg,param_grid,cv=5,n_jobs=-1,scoring='neg_mean_squared_error',return_train_score=True)
grid_search.fit(x_train,y_train)