In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [29]:
# Read the data using pandas and make a copy
df = pd.read_csv('../datasets/views/2020/data-us-2020.csv', na_values='?', comment = '\t', sep=',', skipinitialspace=True)

data = df.copy()

# Dropping Country column as it is all US data in this file
data.drop(['Country'], axis=1, inplace=True)

In [30]:
data.sample(15)

Unnamed: 0,ConvertedComp,EdLevel,YearsCode
1827,150000.0,8,51
1086,180000.0,5,40
3608,120000.0,5,20
7452,55000.0,5,6
7078,65000.0,5,10
5152,95000.0,6,25
8030,1080.0,3,10
4413,105000.0,5,16
3341,120000.0,6,7
7506,55000.0,5,30


In [31]:
# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

In [32]:
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

In [33]:

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [34]:
# Fitting the model to the data
model = RandomForestRegressor()
model.fit(X_train, y_train);
model.score(X_test, y_test)

0.09610071208402649

In [35]:
# Making a prediction
# y_label = regr.predict(np.array([0, 2, 3, 4]))

In [36]:
y_preds = model.predict(X_test)
y_preds

array([110256.19097736, 103056.67139278,  82969.51703297, ...,
       121257.00336142,  96861.09321282, 111374.80739007])

In [37]:
y_test

7587     50000.0
2069    149000.0
7621     50000.0
2063    150000.0
7733     45000.0
          ...   
5890     83522.0
373     250000.0
7793     42000.0
5840     85000.0
1945    150000.0
Name: ConvertedComp, Length: 1617, dtype: float64

In [38]:
# Evaluating this model on the training data and the test data
model.score(X_train, y_train)

0.1855195599182644

In [39]:
model.score(X_test, y_test)

0.09610071208402649

In [40]:
np.random.seed(42)
for i in range(10, 200, 10):
    print(f'Trying model with {i} estimators...')
    model = RandomForestRegressor(n_estimators=i).fit(X_train, y_train)
    print(f'Model accuracy on test set: {model.score(X_test, y_test) * 100}%')
    print(' ')

Trying model with 10 estimators...
Model accuracy on test set: 9.46494792769249%
 
Trying model with 20 estimators...
Model accuracy on test set: 8.921108944995847%
 
Trying model with 30 estimators...
Model accuracy on test set: 9.099018247058567%
 
Trying model with 40 estimators...
Model accuracy on test set: 8.941171792903834%
 
Trying model with 50 estimators...
Model accuracy on test set: 9.883109904976084%
 
Trying model with 60 estimators...
Model accuracy on test set: 9.791823008875067%
 
Trying model with 70 estimators...
Model accuracy on test set: 9.414085694324903%
 
Trying model with 80 estimators...
Model accuracy on test set: 9.5937968500867%
 
Trying model with 90 estimators...
Model accuracy on test set: 9.431943945603683%
 
Trying model with 100 estimators...
Model accuracy on test set: 9.533801719475898%
 
Trying model with 110 estimators...
Model accuracy on test set: 9.608118930856191%
 
Trying model with 120 estimators...
Model accuracy on test set: 9.35793202609

In [41]:
import pickle

pickle.dump(model, open('test_model.pkl', 'wb'))

In [42]:
loaded_model = pickle.load(open('test_model.pkl', 'rb'))
loaded_model.score(X_test, y_test)

0.09544319079234875

# divider

In [43]:
# ml map shows to try lasso or elasticnet if few features are important
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.1)
reg.fit(X_train, y_train);

In [44]:
reg.score(X_test, y_test) * 100

14.065679901160511

In [45]:
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

np.random.seed(42)

model2 = ElasticNet(random_state=0)
model2.fit(X_train, y_train)

model2.score(X_test, y_test)

0.1393219394505294

### ml map shows to try ridgeregression or SVR kernel linear if few features are important

In [46]:
# RidgeRegression
from sklearn.linear_model import Ridge

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Ridge
regr3 = Ridge()
regr3.fit(X_train, y_train)

regr3.score(X_test, y_test)

0.14065656807772442

In [47]:
# SVR kernel='rbf'
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svr_linear = svm.SVR(kernel='linear')
svr_linear.fit(X_train, y_train)

svr_linear.score(X_test, y_test)

0.09471349035298737

In [48]:
# ml map shows to try SVR kernel rbf or ensembleregressors if nothing else worked

In [49]:
# SVR kernel='rbf'
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svr_rbf = svm.SVR(kernel='rbf')
svr_rbf.fit(X_train, y_train)

svr_rbf.score(X_test, y_test)

-0.031223715099687688

In [50]:
# EnsembleRegressors
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

rf.score(X_test, y_test)

0.09610071208402649

In [51]:
from sklearn.linear_model import LinearRegression

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
lreg = LinearRegression()
lreg.fit(X_train, y_train)

lreg.score(X_test, y_test)

0.14065684126765576

In [52]:
# trying svc linear classification
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svcc = svm.SVC()
svcc.fit(X_train, y_train)

svcc.score(X_test, y_test)

0.032158317872603585

In [53]:
svcc.predict([[0, 1]])

array([80000.])

In [54]:
svcc.predict([[2, 8]])

array([100000.])

# divider