In [56]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [57]:
# Load the two cleaned salary tables into DataFrames.
# The data directory can be overridden with the SALARIES_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original location is used unchanged.
path = os.environ.get(
    'SALARIES_DATA_DIR',
    'C:/Users/xinwa/OneDrive/Documents/Data Science/Capstone Project 2/data/',
)
# os.path.join copes with a directory given with or without a trailing separator.
salaries_by_college_type_df = pd.read_csv(os.path.join(path, 'salaries_by_college_type_cleaned.csv'))
salaries_by_region_df = pd.read_csv(os.path.join(path, 'salaries_by_region_cleaned.csv'))


In [58]:
# Show the (rows, columns) size of each loaded table
salaries_by_college_type_df.shape, salaries_by_region_df.shape

((269, 6), (320, 6))

In [59]:
# Show the column names of each loaded table to see which columns they share
salaries_by_college_type_df.columns, salaries_by_region_df.columns

(Index(['School Name', 'School Type', 'Starting Median Salary',
        'Mid-Career Median Salary', 'Mid-Career 25th Percentile Salary',
        'Mid-Career 75th Percentile Salary'],
       dtype='object'),
 Index(['School Name', 'Region', 'Starting Median Salary',
        'Mid-Career Median Salary', 'Mid-Career 25th Percentile Salary',
        'Mid-Career 75th Percentile Salary'],
       dtype='object'))

In [60]:
# Distinct school names that are present in both tables
school_names = set(salaries_by_college_type_df['School Name']) & set(salaries_by_region_df['School Name'])
len(school_names)

248

In [61]:
# Number of rows in the college-type table whose school name is one of the shared names
# (this can exceed len(school_names) when a school name occurs on more than one row)
salaries_by_college_type_df['School Name'].isin(school_names).sum()

268

In [62]:
# Number of rows in the region table whose school name is one of the shared names
salaries_by_region_df['School Name'].isin(school_names).sum()

248

In [63]:
# Inner-join the two tables on the school name plus every salary column they
# have in common, so each matched row carries both 'School Type' and 'Region'.
merged_college_type_region_df = pd.merge(
    salaries_by_college_type_df,
    salaries_by_region_df,
    how='inner',
    on=[
        'School Name',
        'Starting Median Salary',
        'Mid-Career Median Salary',
        'Mid-Career 25th Percentile Salary',
        'Mid-Career 75th Percentile Salary',
    ],
)
merged_college_type_region_df.head()


Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Region
0,Massachusetts Institute of Technology (MIT),Engineering,72200.0,126000.0,99200.0,168000.0,Northeastern
1,California Institute of Technology (CIT),Engineering,75500.0,123000.0,104000.0,161000.0,California
2,Harvey Mudd College,Engineering,71800.0,122000.0,96000.0,180000.0,California
3,"Polytechnic University of New York, Brooklyn",Engineering,62400.0,114000.0,94300.0,143000.0,Northeastern
4,Cooper Union,Engineering,62200.0,114000.0,80200.0,142000.0,Northeastern


In [64]:
# Show the (rows, columns) size of the merged table
merged_college_type_region_df.shape

(266, 7)

In [65]:
# Show the column names of the merged table
merged_college_type_region_df.columns

Index(['School Name', 'School Type', 'Starting Median Salary',
       'Mid-Career Median Salary', 'Mid-Career 25th Percentile Salary',
       'Mid-Career 75th Percentile Salary', 'Region'],
      dtype='object')

In [66]:
# Build the modelling data (no scaling yet).
# Features: 'Starting Median Salary' plus one-hot encoded 'School Type' and 'Region';
# the school name and all mid-career salary columns are left out of X.
X = pd.get_dummies(
    merged_college_type_region_df.drop(
        columns=[
            'School Name',
            'Mid-Career 25th Percentile Salary',
            'Mid-Career 75th Percentile Salary',
            'Mid-Career Median Salary',
        ]
    ),
    columns=['School Type', 'Region'],
)
# Target: the mid-career median salary
y = merged_college_type_region_df['Mid-Career Median Salary']
# train/test split: 30% of the rows are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [67]:
# Fit each model on the training data, predict on the testing data and collect its metrics
def model_test(models, X_train, X_test, y_train, y_test):
    """Fit every model and return its test-set regression metrics.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place, so the caller's objects are
        (re)trained by this call.
    X_train, X_test
        Feature data used for fitting and for prediction respectively.
    y_train, y_test
        Target values matching ``X_train`` and ``X_test``.

    Returns
    -------
    pandas.DataFrame
        One column per model name and one row per metric: ``'r2_score'``,
        ``'Mean absolute error'``, ``'Mean squared error'`` and
        ``'Square root of mean squared error'``.
    """
    metrics_results = {}
    for name, estimator in models.items():
        estimator.fit(X_train, y_train)
        y_pred = estimator.predict(X_test)
        # Compute the MSE once and derive the RMSE from it.
        mse = metrics.mean_squared_error(y_test, y_pred)
        metrics_results[name] = {
            'r2_score': metrics.r2_score(y_test, y_pred),
            'Mean absolute error': metrics.mean_absolute_error(y_test, y_pred),
            'Mean squared error': mse,
            'Square root of mean squared error': np.sqrt(mse),
        }
    return pd.DataFrame(metrics_results)

In [68]:
# Baseline: both models with default settings, trained and tested on the original (unscaled) data
models = {
    'OLS': LinearRegression(),
    'RandomForest': RandomForestRegressor(random_state=42),
}
model_test(models, X_train, X_test, y_train, y_test)

Unnamed: 0,OLS,RandomForest
r2_score,0.8110148,0.7331163
Mean absolute error,4423.734,5183.918
Mean squared error,31801440.0,44909780.0
Square root of mean squared error,5639.276,6701.476


In [69]:
# Same default models, now trained and tested on scaled data.
# Only 'Starting Median Salary' is standardized; the scaler is fitted on the
# training rows and then reused, unchanged, for the test rows.
scaler = StandardScaler()
train_starting_median_salary_scaled = scaler.fit_transform(X_train[['Starting Median Salary']])
test_starting_median_salary_scaled = scaler.transform(X_test[['Starting Median Salary']])

# Put the scaled salary column first, followed by the remaining (dummy) columns as a NumPy array.
X_train_scaled = np.hstack([
    train_starting_median_salary_scaled,
    X_train.drop(columns=['Starting Median Salary']).to_numpy(),
])
X_test_scaled = np.hstack([
    test_starting_median_salary_scaled,
    X_test.drop(columns=['Starting Median Salary']).to_numpy(),
])

model_test(models, X_train_scaled, X_test_scaled, y_train, y_test)


Unnamed: 0,OLS,RandomForest
r2_score,0.8110148,0.7002251
Mean absolute error,4423.734,5398.295
Mean squared error,31801440.0,50444550.0
Square root of mean squared error,5639.276,7102.432


In [70]:
def gridsearchcv(estimator, param_grid, X_train, y_train, cv=None, scoring=None):
    """Run an exhaustive grid search and return the winning configuration.

    Parameters
    ----------
    estimator
        Unfitted scikit-learn estimator to tune.
    param_grid : dict or list of dict
        Hyperparameter grid, passed to ``GridSearchCV`` as ``param_grid``.
    X_train, y_train
        Training features and targets the search is fitted on.
    cv : optional
        Cross-validation strategy forwarded to ``GridSearchCV``. The default
        ``None`` keeps GridSearchCV's own default splitting.
    scoring : optional
        Scoring forwarded to ``GridSearchCV``. The default ``None`` keeps the
        estimator's own ``score`` method.

    Returns
    -------
    tuple
        ``(best_params_, best_estimator_)`` taken from the fitted search.
    """
    grid = GridSearchCV(estimator, param_grid=param_grid, cv=cv, scoring=scoring)
    grid.fit(X_train, y_train)
    return grid.best_params_, grid.best_estimator_

In [71]:
# Tune the random forest's number of trees with GridSearchCV.
# Note: the search is fitted on the unscaled training features (X_train).
rf_params = {
    'n_estimators': [3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200],
}
rf_best_params, rf_best_estimator = gridsearchcv(
    RandomForestRegressor(random_state=42),
    rf_params,
    X_train,
    y_train,
)
rf_best_params, rf_best_estimator

({'n_estimators': 200},
 RandomForestRegressor(n_estimators=200, random_state=42))

In [72]:
# Compare OLS with the tuned random forest on the scaled data.
# model_test calls fit() again, so rf_best_estimator is refitted on X_train_scaled here.
models_tuned = {
    'OLS': LinearRegression(),
    'RandomForest': rf_best_estimator,
}
model_test(models_tuned, X_train_scaled, X_test_scaled, y_train, y_test)

Unnamed: 0,OLS,RandomForest
r2_score,0.8110148,0.70988
Mean absolute error,4423.734,5351.232
Mean squared error,31801440.0,48819870.0
Square root of mean squared error,5639.276,6987.122


Conclusion: 
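Comparing the metrics above, the linear model performs better than the random forest model on every measure (higher R², lower MAE, MSE, and RMSE).
Tuning the hyperparameter n_estimators improves the random forest slightly over its default value on the scaled data (R² 0.710 vs. 0.700), although the default random forest scored higher on the unscaled data (R² 0.733).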

In [73]:
# List the feature columns in X_train: the starting median salary plus the school-type and region dummies
X_train.columns

Index(['Starting Median Salary', 'School Type_Engineering',
       'School Type_Ivy League', 'School Type_Liberal Arts',
       'School Type_Party', 'School Type_State', 'Region_California',
       'Region_Midwestern', 'Region_Northeastern', 'Region_Southern',
       'Region_Western'],
      dtype='object')

In [74]:
# Predict Mid-Career Median Salary with a linear model whose only feature is
# the (standardized) Starting Median Salary
model = {'OLS': LinearRegression()}
model_test(
    model,
    train_starting_median_salary_scaled,
    test_starting_median_salary_scaled,
    y_train,
    y_test,
)

Unnamed: 0,OLS
Mean absolute error,4593.745
Mean squared error,36726950.0
Square root of mean squared error,6060.276
r2_score,0.7817441


Conclusion:
With the dummy variables included in X, the linear model performs better than when they are left out (R² 0.811 vs. 0.782).