In [167]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit

import warnings
warnings.filterwarnings('ignore')

In [168]:
# Read the data using pandas and make a copy
df = pd.read_csv('../datasets/views/2020/data-us-2020.csv', na_values='?', comment = '\t', sep=',', skipinitialspace=True)

data = df.copy()

# Dropping Country column as it is all US data in this file
data.drop(['Country'], axis=1, inplace=True)

In [169]:
data.sample(15)

Unnamed: 0,ConvertedComp,EdLevel,YearsCode
4003,110000.0,5,10
5332,90000.0,6,4
914,190000.0,3,26
7435,56000.0,6,25
832,200000.0,8,45
4239,108000.0,5,3
5819,85000.0,6,10
6914,68000.0,5,13
3798,115000.0,5,10
3541,120000.0,5,20


In [170]:
# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

In [171]:
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

In [172]:
from sklearn.model_selection import train_test_split

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [173]:
# Fitting the model to the data
model = RandomForestRegressor()
model.fit(X_train, y_train);
model.score(X_test, y_test)

0.09610071208402649

In [174]:
# Making a prediction
# y_label = regr.predict(np.array([0, 2, 3, 4]))

In [175]:
y_preds = model.predict(X_test)
y_preds

array([110256.19097736, 103056.67139278,  82969.51703297, ...,
       121257.00336142,  96861.09321282, 111374.80739007])

In [176]:
y_test

7587     50000.0
2069    149000.0
7621     50000.0
2063    150000.0
7733     45000.0
          ...   
5890     83522.0
373     250000.0
7793     42000.0
5840     85000.0
1945    150000.0
Name: ConvertedComp, Length: 1617, dtype: float64

In [177]:
# Evaluating this model on the training data and the test data
model.score(X_train, y_train)

0.1855195599182644

In [178]:
model.score(X_test, y_test)

0.09610071208402649

In [199]:
np.random.seed(42)
for i in range(10, 200, 10):
    print(f'Trying model with {i} estimators...')
    model = RandomForestRegressor(n_estimators=i).fit(X_train, y_train)
    print(f'Model accuracy on test set: {model.score(X_test, y_test) * 100}%')
    print(' ')

Trying model with 10 estimators...
Model accuracy on test set: 9.46494792769249%
 
Trying model with 20 estimators...
Model accuracy on test set: 8.921108944995847%
 
Trying model with 30 estimators...
Model accuracy on test set: 9.099018247058567%
 
Trying model with 40 estimators...
Model accuracy on test set: 8.941171792903834%
 
Trying model with 50 estimators...
Model accuracy on test set: 9.883109904976084%
 
Trying model with 60 estimators...
Model accuracy on test set: 9.791823008875067%
 
Trying model with 70 estimators...
Model accuracy on test set: 9.414085694324903%
 
Trying model with 80 estimators...
Model accuracy on test set: 9.5937968500867%
 
Trying model with 90 estimators...
Model accuracy on test set: 9.431943945603683%
 
Trying model with 100 estimators...
Model accuracy on test set: 9.533801719475898%
 
Trying model with 110 estimators...
Model accuracy on test set: 9.608118930856191%
 
Trying model with 120 estimators...
Model accuracy on test set: 9.35793202609

In [180]:
import pickle

pickle.dump(regr, open('test_model.pkl', 'wb'))

In [181]:
loaded_model = pickle.load(open('test_model.pkl', 'rb'))
loaded_model.score(X_test, y_test)

0.09544319079234875

# divider

In [182]:
# ml map shows to try lasso or elasticnet if few features are important
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.1)
reg.fit(X_train, y_train);

In [183]:
reg.score(X_test, y_test) * 100

14.065679901160511

In [184]:
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

np.random.seed(42)

model2 = ElasticNet(random_state=0)
model2.fit(X_train, y_train)

model2.score(X_test, y_test)

0.1393219394505294

In [185]:
# ml map shows to try ridgeregression or SVR kernel linear if few features are important


In [186]:
# RidgeRegression
from sklearn.linear_model import Ridge

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Ridge
regr3 = Ridge()
regr3.fit(X_train, y_train)

regr3.score(X_test, y_test)

0.14065656807772442

In [187]:
# SVR kernel='rbf'
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svr_linear = svm.SVR(kernel='linear')
svr_linear.fit(X_train, y_train)

svr_linear.score(X_test, y_test)

0.09471349035298737

In [188]:
# ml map shows to try SVR kernel rbf or ensembleregressors if nothing else worked

In [189]:
# SVR kernel='rbf'
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svr_rbf = svm.SVR(kernel='rbf')
svr_rbf.fit(X_train, y_train)

svr_rbf.score(X_test, y_test)

-0.031223715099687688

In [190]:
# EnsembleRegressors
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

rf.score(X_test, y_test)

0.09610071208402649

In [191]:
from sklearn.linear_model import LinearRegression

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
lreg = LinearRegression()
lreg.fit(X_train, y_train)

# lreg.score(X_test, y_test)

LinearRegression()

In [192]:
# from sklearn.metrics import mean_squared_error

# salary_preds = lreg.predict(y_train)
# lin_mse = mean_squared_error(X_train, salary_preds)
# lin_rmse = np.sqrt(lin_mse)
# lin_rmse

In [193]:
# trying svc linear classification
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svcc = svm.SVC()
svcc.fit(X_train, y_train)

svcc.score(X_test, y_test)

0.032158317872603585

In [194]:
svcc.predict([[0, 1]])

array([80000.])

# divider

In [195]:
data_cat = data[['EdLevel']]
data_cat.head()

Unnamed: 0,EdLevel
0,5
1,2
2,7
3,5
4,5


In [196]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
data_cat_1hot = cat_encoder.fit_transform(data_cat)

In [197]:
data_cat_1hot.toarray()[:5]

array([[0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.]])

In [198]:
cat_encoder.categories_

[array([0, 1, 2, 3, 4, 5, 6, 7, 8])]