<a href="https://colab.research.google.com/github/swapnilbetkar/python/blob/Dev_Class/Code_Mohapatra_Doyle_Betkar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: mount Google Drive so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
import plotly.graph_objs as go
import plotly.figure_factory as ff
import statsmodels.api as sm
# NOTE(review): duplicate of the plotly.graph_objs import a few lines above.
import plotly.graph_objs as go
import warnings
# Suppresses every warning for the rest of the session; this also hides
# library deprecation and convergence warnings.
warnings.filterwarnings('ignore')

## Importing and analyzing the dataset
# Read the SuperMart CSV from the mounted Drive folder.
dataset = pd.read_csv("/content/drive/MyDrive/Datasets/SuperMart.csv")
# Print every column of a frame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
print(dataset.head())
print(dataset.shape)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
dataset.info()
print(dataset.describe())

##### Converting Categorical features into Numerical features

# Nominal columns (no natural order) that get one-hot encoded below.
categorical_features = ['Marital_Status']

# Using map function for converting ordinal categorical features to numerical features.
# Each dictionary encodes the category order as increasing integers.
# NOTE: Series.map() turns any label that is not a key of the dictionary into NaN.
dataset['Age_Group'] = dataset['Age_Group'].map({'>60': 2, '30-60': 1, '<30': 0})
dataset['Income_Group'] = dataset['Income_Group'].map(
    {'>=90K': 3, '60K-89K': 2, '30K-59K': 1, '<30K': 0})
dataset['Education'] = dataset['Education'].map(
    {'PhD': 4, 'Master': 3, 'Graduation': 2, 'High School': 1, 'Middle School': 0})

# Using One-Hot Encoding technique
# get_dummies returns a new frame; `dataset` itself keeps the original column.
final_data = pd.get_dummies(dataset, columns=categorical_features)
# info() prints its own report and returns None, so it is not wrapped in print().
final_data.info()
print(final_data.head(2))

### Feature Selection
# Plotting Correlation Heatmap
# `dataset` still carries the un-encoded 'Marital_Status' column (only
# `final_data` is one-hot encoded). pandas >= 2.0 raises when corr() meets a
# non-numeric column, whereas older versions silently skipped it. Selecting
# numeric/boolean columns first gives the same matrix on every version.
corrs = dataset.select_dtypes(include=['number', 'bool']).corr()
figure = ff.create_annotated_heatmap(
    z=corrs.values,
    x=list(corrs.columns),
    y=list(corrs.index),
    # Cell labels are rounded to two decimals to keep the annotations readable.
    annotation_text=corrs.round(2).values,
    showscale=True)
figure.show()

# Separate the regression target from the predictors
Y = final_data['Revenue']  # Labels
X = final_data.drop(columns=['Revenue', 'Income_Group'])  # Features
print(type(X), type(Y), sep='\n')
print(X.shape, Y.shape, sep='\n')

### Standardise the predictors: every column ends up with mean 0 and variance 1
feature_scaler = StandardScaler().fit(X)
X_scaled = feature_scaler.transform(X)

#Implementing Linear Regression
# Tuning the SGDRegressor parameters 'eta0' (learning rate) and 'max_iter' using Grid Search
# penalty=None is set on the estimator being tuned so that the search scores
# the same unregularised model that is reported below; SGDRegressor's default
# penalty is 'l2', which would otherwise tune a different model.
sgdr = SGDRegressor(penalty=None, random_state=1)
grid_param = {'eta0': [0.00001, .0001, .001], 'max_iter': [10000, 20000]}

# 5-fold cross-validation, scored by R^2.
gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)

#Building SGDRegressor using the tuned parameters
# The tuned values are taken from best_params_ instead of being re-typed by
# hand, so the reported model always matches the search result.
sgdr = SGDRegressor(penalty=None, random_state=1, **best_parameters)
sgdr.fit(X_scaled, Y)
print('Intercept', sgdr.intercept_)
# Coefficients paired with their feature names, largest first.
print(pd.DataFrame(zip(X.columns, sgdr.coef_), columns=['Features','Coefficients']).sort_values(by=['Coefficients'],ascending=False))

#Implementing L2 Regularization (Ridge Regression)
#Tuning Regularization parameter alpha
# eta0 and max_iter are fixed here; only the regularisation strength is searched.
sgdr = SGDRegressor(eta0=.0001, max_iter=10000, penalty='l2', random_state=1)
grid_param = {'alpha': [0.00001, .0001, .001, .01, .1, 1]}

# 5-fold cross-validation, scored by R^2.
gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=5)

gd_sr.fit(X_scaled, Y)

best_parameters = gd_sr.best_params_
print(best_parameters)

best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)

# Building SGDRegressor using the tuned parameters for Ridge Regularization
# alpha comes from best_params_ rather than a hand-copied constant, so the
# reported model cannot drift from the search result when the data changes.
sgdr = SGDRegressor(eta0=.0001, max_iter=10000, penalty='l2', random_state=1, **best_parameters)
sgdr.fit(X_scaled, Y)
print('Intercept', sgdr.intercept_)
# Coefficients paired with their feature names, largest first.
print(pd.DataFrame(zip(X.columns, sgdr.coef_), columns=['Features','Coefficients']).sort_values(by=['Coefficients'],ascending=False))

# Implementing L1 Regularization (Lasso Regression)
# Tuning Regularization parameter alpha
# eta0 and max_iter are fixed here; only the regularisation strength is searched.
sgdr = SGDRegressor(eta0=.0001, max_iter=10000, penalty='l1', random_state=1)
grid_param = {'alpha': [0.1, 1, 10, 25, 50, 75]}
# 5-fold cross-validation, scored by R^2.
gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=5)
gd_sr.fit(X_scaled, Y)
best_parameters = gd_sr.best_params_
print(best_parameters)
best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)

# Building SGDRegressor using the tuned parameters for Lasso Regularization
# alpha comes from best_params_ rather than a hand-copied constant, so the
# reported model cannot drift from the search result when the data changes.
sgdr = SGDRegressor(eta0=.0001, max_iter=10000, penalty='l1', random_state=1, **best_parameters)
sgdr.fit(X_scaled, Y)
print('Intercept', sgdr.intercept_)
# Coefficients paired with their feature names, largest first.
print(pd.DataFrame(zip(X.columns, sgdr.coef_), columns=['Features','Coefficients']).sort_values(by=['Coefficients'],ascending=False))

# Implementing Elastic Net Regularization (Elastic Net Regression)
# Tuning Regularization parameter alpha and l1_ratio
# l1_ratio mixes the two penalties: 0 is pure L2 and 1 is pure L1.
sgdr = SGDRegressor(eta0=.0001, max_iter=10000, penalty='elasticnet', random_state=1)
grid_param = {'alpha': [.0001, .001, .01, .1, 1, 10, 50, 80, 100],
              'l1_ratio': [0, 0.1, 0.3, 0.5, 0.7, 0.9, 1]}
# 5-fold cross-validation, scored by R^2.
gd_sr = GridSearchCV(estimator=sgdr, param_grid=grid_param, scoring='r2', cv=5)
gd_sr.fit(X_scaled, Y)
best_parameters = gd_sr.best_params_
print(best_parameters)
best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print(best_result)

## Building SGDRegressor using the tuned parameters for Elastic Net
# alpha and l1_ratio come from best_params_ rather than hand-copied constants,
# so the reported model cannot drift from the search result.
sgdr = SGDRegressor(eta0=.0001, max_iter=10000, penalty='elasticnet', random_state=1, **best_parameters)
sgdr.fit(X_scaled, Y)
print('Intercept', sgdr.intercept_)
# Coefficients paired with their feature names, largest first.
print(pd.DataFrame(zip(X.columns, sgdr.coef_), columns=['Features','Coefficients']).sort_values(by=['Coefficients'],ascending=False))

# Support Vector Regression (L2 regularised)
# Search over 'kernel', 'C' and 'epsilon' with 5-fold cross-validation, scored by R^2
grid_param = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [100, 1000, 10000],
    'epsilon': [10, 100, 1000, 10000],
}
svr = SVR()
gd_sr = GridSearchCV(svr, grid_param, scoring='r2', cv=5)
gd_sr.fit(X_scaled, Y)

# Winning parameter combination
best_parameters = gd_sr.best_params_
print("Optimal parameters:\n", best_parameters)

# Mean cross-validated R^2 achieved by that combination
best_result = gd_sr.best_score_
print("Best mean cross-validated score:\n", best_result)

# Implementing Random Forest Regression
## Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
# `criterion` is left at its default, which is mean squared error in every
# scikit-learn release. The explicit 'mse' spelling was removed in 1.2 (renamed
# to 'squared_error'), so passing it makes the script fail on current versions.
rfr = RandomForestRegressor(max_features='sqrt', random_state=1)
grid_param = {'n_estimators': [100, 200, 300, 450, 500]}
# 5-fold cross-validation, scored by R^2.
gd_sr = GridSearchCV(estimator=rfr, param_grid=grid_param, scoring='r2', cv=5)
gd_sr.fit(X_scaled, Y)
best_parameters = gd_sr.best_params_
print("Optimal parameters:\n", best_parameters)
best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("Best mean cross-validated score:\n", best_result)

# Building random forest using the tuned parameter
# n_estimators is taken from best_params_ instead of a hand-copied constant so
# the fitted forest always matches the search result.
rfr = RandomForestRegressor(max_features='sqrt', random_state=1, **best_parameters)
rfr.fit(X_scaled, Y)
# Importances labelled with the feature names, most important first.
featimp = pd.Series(rfr.feature_importances_, index=list(X)).sort_values(ascending=False)
print(featimp)
# Re-run the forest on a hand-picked subset of features.
# NOTE(review): presumably the top entries of the importance ranking printed
# above; the list is hard-coded, so confirm it still matches after data changes.
X_ = final_data[['NumWebPurchases', 'NumStorePurchases', 'Dependents','AcceptedCmp5','NumDealsPurchases','AcceptedCmp1','Education','AcceptedCmp6','AcceptedCmp4','Age_Group']]
# A fresh scaler is fitted because the column set differs from X.
feature_scaler = StandardScaler()
X_scaled_ = feature_scaler.fit_transform(X_)

## Tuning the random forest parameter 'n_estimators' and implementing cross-validation using Grid Search
# `criterion` is left at its default, which is mean squared error in every
# scikit-learn release. The explicit 'mse' spelling was removed in 1.2 (renamed
# to 'squared_error'), so passing it makes the script fail on current versions.
rfr = RandomForestRegressor(max_features='sqrt', random_state=1)
grid_param = {'n_estimators': [450, 500, 550, 600, 650, 700, 750]}
# 5-fold cross-validation, scored by R^2.
gd_sr = GridSearchCV(estimator=rfr, param_grid=grid_param, scoring='r2', cv=5)
gd_sr.fit(X_scaled_, Y)
best_parameters = gd_sr.best_params_
print("Optimal parameters:\n", best_parameters)
best_result = gd_sr.best_score_ # Mean cross-validated score of the best_estimator
print("Best mean cross-validated score:\n", best_result)

# Implement p-value for analysis of significant features
final_data.info()
# The scaled matrix is a bare ndarray; fitting OLS on it labels the
# coefficients x1..xN, which makes the p-value table impossible to map back to
# features. Wrapping it in a DataFrame restores the column names, and reusing
# Y's index keeps endog and exog aligned for statsmodels.
# NOTE(review): this regresses on all features in X_scaled, not on the reduced
# subset X_scaled_; the name X_ is reused here and overwrites the subset frame.
X_ = sm.add_constant(pd.DataFrame(X_scaled, columns=X.columns, index=Y.index))
est = sm.OLS(Y, X_)
model = est.fit()
# The summary reports each coefficient with its standard error, t-statistic and p-value.
print(model.summary())