Regression Model <br>

In [86]:
# Import Statements
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics


In [80]:
# Preparing the data for regression

# Creating new dataframe

# Features we want: ride_time_mins, price, temperature, rain
regression_features = ['price','ride_time_mins','temp','rain']
regressors = regression_features[1:len(regression_features)]

cleaned_data = pd.read_csv("data/cleaned.csv")
cleaned_data['rain'] = cleaned_data['rain'].replace(np.nan, 0)
regression_data = cleaned_data.loc[:,regression_features]
regression_data.head(5)

# One hot encoding on categorical values


Unnamed: 0,price,ride_time_mins,temp,rain
0,5.0,585,38.46,0.0
1,11.0,105,44.26,0.7625
2,7.0,45,38.87,0.0
3,26.0,292,35.08,0.0
4,9.0,212,37.58,0.0


In [81]:
# Split data into training and testing
y = regression_data['price']
X = regression_data[regressors]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3, random_state=1)


In [93]:
# Choose regression model and train it
reg = Lasso(alpha=0.1,random_state=1)
reg.fit(X_train, y_train)
predicted_results = reg.predict(X=X_test)

scores_regr = mean_squared_error(y_test, predicted_results)
print(scores_regr)

86.88975886069488


In [89]:
# Optimizing alpha

    # TASK: Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent
pipeline = Pipeline([
    ('reg', Lasso()),
])

# TASK: Build a grid search to find out whether unigrams or bigrams are
# more useful.
# Fit the pipeline on the training set using grid search for the parameters
parameters = {
    'reg__alpha': [0.0001,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
    'reg__normalize': [True, False]
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1,scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# TASK: print the mean and std for each candidate along with the parameter
# settings for all the candidates explored by grid search.
n_candidates = len(grid_search.cv_results_['params'])
for i in range(n_candidates):
    print(i, 'params - %s; mean - %0.2f; std - %0.2f'
                % (grid_search.cv_results_['params'][i],
                grid_search.cv_results_['mean_test_score'][i],
                grid_search.cv_results_['std_test_score'][i]))

# TASK: Predict the outcome on the testing set and store it in a variable
# named y_predicted
y_predicted = grid_search.predict(X_test)


0 params - {'clf__alpha': 0.0001, 'clf__normalize': True}; mean - -86.96; std - 0.43
1 params - {'clf__alpha': 0.0001, 'clf__normalize': False}; mean - -86.96; std - 0.43
2 params - {'clf__alpha': 0.1, 'clf__normalize': True}; mean - -86.96; std - 0.43
3 params - {'clf__alpha': 0.1, 'clf__normalize': False}; mean - -86.96; std - 0.43
4 params - {'clf__alpha': 0.2, 'clf__normalize': True}; mean - -86.96; std - 0.43
5 params - {'clf__alpha': 0.2, 'clf__normalize': False}; mean - -86.96; std - 0.43
6 params - {'clf__alpha': 0.3, 'clf__normalize': True}; mean - -86.96; std - 0.43
7 params - {'clf__alpha': 0.3, 'clf__normalize': False}; mean - -86.96; std - 0.43
8 params - {'clf__alpha': 0.4, 'clf__normalize': True}; mean - -86.96; std - 0.43
9 params - {'clf__alpha': 0.4, 'clf__normalize': False}; mean - -86.96; std - 0.43
10 params - {'clf__alpha': 0.5, 'clf__normalize': True}; mean - -86.96; std - 0.43
11 params - {'clf__alpha': 0.5, 'clf__normalize': False}; mean - -86.96; std - 0.43
12

In [96]:
# Elastic Net regression

reg2 = ElasticNet(alpha=0.1,l1_ratio= 0.1, random_state=1)
reg2.fit(X_train, y_train)

predicted_results2 = reg2.predict(X=X_test)

scores_regr2 = mean_squared_error(y_test, predicted_results2)
print(scores_regr2)

86.88990567534742


In [99]:
# Optimizing alpha and l1_ratio

    # TASK: Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent
pipeline = Pipeline([
    ('reg2', ElasticNet()),
])

# TASK: Build a grid search to find out whether unigrams or bigrams are
# more useful.
# Fit the pipeline on the training set using grid search for the parameters
parameters = {
    'reg2__alpha': [0.0001,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
    'reg2__l1_ratio': [0.0001,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1,scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# TASK: print the mean and std for each candidate along with the parameter
# settings for all the candidates explored by grid search.
n_candidates = len(grid_search.cv_results_['params'])
for i in range(n_candidates):
    print(i, 'params - %s; mean - %0.2f; std - %0.2f'
                % (grid_search.cv_results_['params'][i],
                grid_search.cv_results_['mean_test_score'][i],
                grid_search.cv_results_['std_test_score'][i]))

# TASK: Predict the outcome on the testing set and store it in a variable
# named y_predicted
y_predicted = grid_search.predict(X_test)

KeyboardInterrupt: 