In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit

import warnings
warnings.filterwarnings('ignore')

In [41]:
# Read the data using pandas and make a copy
df = pd.read_csv('../datasets/views/2020/data-us-2020.csv', na_values='?', comment = '\t', sep=',', skipinitialspace=True)

data = df.copy()

# Dropping Country column as it is all US data in this file
data.drop(['Country'], axis=1, inplace=True)

In [42]:
data.sample(15)

Unnamed: 0,ConvertedComp,EdLevel,YearsCode
1775,110000.0,5,12
4406,130000.0,8,11
5084,19200.0,5,4
7952,72800.0,5,8
6958,150000.0,4,39
3058,250000.0,8,15
626,60000.0,6,12
4732,80000.0,6,18
2114,145000.0,4,19
665,115000.0,6,15


In [61]:
# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

In [62]:
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

In [63]:
from sklearn.model_selection import train_test_split

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [65]:
# Fitting the model to the data
model = RandomForestRegressor()
model.fit(X_train, y_train);
model.score(X_test, y_test)

-0.03333312753648743

In [47]:
# Making a prediction
# y_label = regr.predict(np.array([0, 2, 3, 4]))

In [66]:
y_preds = model.predict(X_test)
y_preds

array([211195.27283169,  82928.00021898, 169149.18604521, ...,
       198477.01891546, 191458.56147062, 169149.18604521])

In [67]:
y_test

7587    142800.0
2069    104000.0
7621     80000.0
2063    125000.0
7733     85000.0
          ...   
5890     82500.0
373     120000.0
7793    124000.0
5840    105000.0
1945     95000.0
Name: ConvertedComp, Length: 1617, dtype: float64

In [68]:
# Evaluating this model on the training data and the test data
model.score(X_train, y_train)

0.04938450345635337

In [69]:
model.score(X_test, y_test)

-0.03333312753648743

In [70]:
np.random.seed(42)
for i in range(10, 200, 10):
    print(f'Trying model with {i} estimators...')
    regr = RandomForestRegressor(n_estimators=i).fit(X_train, y_train)
    print(f'Model accuracy on test set: {model.score(X_test, y_test) * -100}%')
    print(' ')

Trying model with 10 estimators...
Model accuracy on test set: 3.333312753648743%
 
Trying model with 20 estimators...
Model accuracy on test set: 3.333312753648743%
 
Trying model with 30 estimators...
Model accuracy on test set: 3.333312753648743%
 
Trying model with 40 estimators...
Model accuracy on test set: 3.333312753648743%
 
Trying model with 50 estimators...
Model accuracy on test set: 3.333312753648743%
 
Trying model with 60 estimators...
Model accuracy on test set: 3.333312753648743%
 
Trying model with 70 estimators...
Model accuracy on test set: 3.333312753648743%
 
Trying model with 80 estimators...
Model accuracy on test set: 3.333312753648743%
 
Trying model with 90 estimators...
Model accuracy on test set: 3.333312753648743%
 
Trying model with 100 estimators...
Model accuracy on test set: 3.333312753648743%
 
Trying model with 110 estimators...
Model accuracy on test set: 3.333312753648743%
 
Trying model with 120 estimators...
Model accuracy on test set: 3.33331275

In [55]:
import pickle

pickle.dump(regr, open('test_model.pkl', 'wb'))

In [56]:
loaded_model = pickle.load(open('test_model.pkl', 'rb'))
loaded_model.score(X_test, y_test)

-0.07048831950012535

# divider

In [73]:
# ml map shows to try lasso or elasticnet if few features are important
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.1)
reg.fit(X_train, y_train);

In [76]:
reg.score(X_test, y_test) * 100

-0.07163788380153413

In [79]:
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

np.random.seed(42)

model2 = ElasticNet(random_state=0)
model2.fit(X_train, y_train)

model2.score(X_test, y_test)

-0.0006323858861854159

In [None]:
# ml map shows to try ridgeregression or SVR kernel linear if few features are important


In [84]:
# RidgeRegression
from sklearn.linear_model import Ridge

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Ridge
regr3 = Ridge()
regr3.fit(X_train, y_train)

regr3.score(X_test, y_test)

-0.0007163517902706307

In [86]:
# SVR kernel='rbf'
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svr_linear = svm.SVR(kernel='linear')
svr_linear.fit(X_train, y_train)

svr_linear.score(X_test, y_test)

-0.0731481045616269

In [None]:
# ml map shows to try SVR kernel rbf or ensembleregressors if nothing else worked

In [85]:
# SVR kernel='rbf'
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svr_rbf = svm.SVR(kernel='rbf')
svr_rbf.fit(X_train, y_train)

svr_rbf.score(X_test, y_test)

-0.06970985520547868

In [81]:
# EnsembleRegressors
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

rf.score(X_test, y_test)

-0.03059067084682643

In [90]:
from sklearn.linear_model import LinearRegression

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
lreg = LinearRegression()
lreg.fit(X_train, y_train)

# lreg.score(X_test, y_test)

LinearRegression()

In [101]:
from sklearn.metrics import mean_squared_error

salary_preds = lreg.predict(y_train)
lin_mse = mean_squared_error(X_train, salary_preds)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

ValueError: Expected 2D array, got 1D array instead:
array=[ 140000.  250000.  111000. ...  100000. 2000000.  188000.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [102]:
# trying svc linear classification
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svcc = svm.SVC()
svcc.fit(X_train, y_train)

svcc.score(X_test, y_test)

0.05318491032776747

In [107]:
svcc.predict([[0, 1]])

array([2000000.])

# divider

In [57]:
data_cat = data[['EdLevel']]
data_cat.head()

Unnamed: 0,EdLevel
0,5
1,4
2,5
3,5
4,5


In [58]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
data_cat_1hot = cat_encoder.fit_transform(data_cat)

In [59]:
data_cat_1hot.toarray()[:5]

array([[0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.]])

In [60]:
cat_encoder.categories_

[array([0, 1, 2, 3, 4, 5, 6, 7, 8])]