In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [29]:
# Read the data using pandas and make a copy
df = pd.read_csv('../datasets/views/2020/data-us-2020.csv', na_values='?', comment = '\t', sep=',', skipinitialspace=True)

data = df.copy()

# Dropping Country column as it is all US data in this file
data.drop(['Country'], axis=1, inplace=True)

In [30]:
data.sample(15)

Unnamed: 0,ConvertedComp,EdLevel,YearsCode
6231,83000.0,6,12
3344,125000.0,6,6
7276,67000.0,5,22
1547,170000.0,5,20
7078,70000.0,6,5
3423,125000.0,6,18
4703,36000.0,5,10
785,220000.0,5,12
3277,127920.0,5,16
2298,150000.0,6,32


In [31]:
# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

In [32]:
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

In [33]:

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [34]:
# Fitting the model to the data
model = RandomForestRegressor()
model.fit(X_train, y_train);
model.score(X_test, y_test)

0.11764542941651512

In [35]:
# Making a prediction
# y_label = regr.predict(np.array([0, 2, 3, 4]))

In [36]:
y_preds = model.predict(X_test)
y_preds

array([128709.42181617, 167671.77761861, 122892.89821235, ...,
        95702.71838142, 144132.40956964, 155509.6663942 ])

In [37]:
y_test

1835    160000.0
2728    140000.0
7977     48000.0
932     202500.0
5356     95000.0
          ...   
4942    100000.0
1097    200000.0
548     255000.0
2897    135000.0
2991    132000.0
Name: ConvertedComp, Length: 1667, dtype: float64

In [38]:
# Evaluating this model on the training data and the test data
model.score(X_train, y_train)

0.2466910428573572

In [39]:
model.score(X_test, y_test)

0.11764542941651512

In [40]:
np.random.seed(42)
for i in range(10, 200, 10):
    print(f'Trying model with {i} estimators...')
    model = RandomForestRegressor(n_estimators=i).fit(X_train, y_train)
    print(f'Model accuracy on test set: {model.score(X_test, y_test) * 100}%')
    print(' ')

Trying model with 10 estimators...
Model accuracy on test set: 10.598925103602419%
 
Trying model with 20 estimators...
Model accuracy on test set: 11.248991175709078%
 
Trying model with 30 estimators...
Model accuracy on test set: 11.262806024697468%
 
Trying model with 40 estimators...
Model accuracy on test set: 11.856546173124272%
 
Trying model with 50 estimators...
Model accuracy on test set: 11.60275930067426%
 
Trying model with 60 estimators...
Model accuracy on test set: 11.37368816152472%
 
Trying model with 70 estimators...
Model accuracy on test set: 11.542249175433883%
 
Trying model with 80 estimators...
Model accuracy on test set: 11.352538126593181%
 
Trying model with 90 estimators...
Model accuracy on test set: 11.552880581982695%
 
Trying model with 100 estimators...
Model accuracy on test set: 11.270479627513852%
 
Trying model with 110 estimators...
Model accuracy on test set: 11.710836699917326%
 
Trying model with 120 estimators...
Model accuracy on test set: 1

In [41]:
import pickle

pickle.dump(model, open('test_model.pkl', 'wb'))

In [42]:
loaded_model = pickle.load(open('test_model.pkl', 'rb'))
loaded_model.score(X_test, y_test)

0.1145225438457973

# divider

In [43]:
# ml map shows to try lasso or elasticnet if few features are important
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.1)
reg.fit(X_train, y_train);

In [44]:
reg.score(X_test, y_test) * 100

14.145968995346736

In [45]:
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

np.random.seed(42)

model2 = ElasticNet(random_state=0)
model2.fit(X_train, y_train)

model2.score(X_test, y_test)

0.14002426159909975

### ml map shows to try ridgeregression or SVR kernel linear if few features are important

In [46]:
# RidgeRegression
from sklearn.linear_model import Ridge

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Ridge
regr3 = Ridge()
regr3.fit(X_train, y_train)

regr3.score(X_test, y_test)

0.14145984339838213

In [47]:
# SVR kernel='rbf'
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svr_linear = svm.SVR(kernel='linear')
svr_linear.fit(X_train, y_train)

svr_linear.score(X_test, y_test)

0.09087305705569815

In [48]:
# ml map shows to try SVR kernel rbf or ensembleregressors if nothing else worked

In [49]:
# SVR kernel='rbf'
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svr_rbf = svm.SVR(kernel='rbf')
svr_rbf.fit(X_train, y_train)

svr_rbf.score(X_test, y_test)

-0.03131346886905795

In [50]:
# EnsembleRegressors
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

rf.score(X_test, y_test)

0.11764542941651512

In [51]:
from sklearn.linear_model import LinearRegression

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
lreg = LinearRegression()
lreg.fit(X_train, y_train)

lreg.score(X_test, y_test)

0.1414596677489407

In [52]:
# trying svc linear classification
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svcc = svm.SVC()
svcc.fit(X_train, y_train)

svcc.score(X_test, y_test)

0.04079184163167367

In [53]:
svcc.predict([[0, 1]])

array([75000.])

In [54]:
svcc.predict([[2, 8]])

array([80000.])

# divider