In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [5]:
# Read the data using pandas and make a copy
df = pd.read_csv('../datasets/views/2020/data-us-2020.csv', na_values='?', comment = '\t', sep=',', skipinitialspace=True)

data = df.copy()

# Dropping Country column as it is all US data in this file
data.drop(['Country'], axis=1, inplace=True)

In [6]:
data.sample(15)

Unnamed: 0,ConvertedComp,EdLevel,YearsCode,YearsCodePro
804,95275.0,5,5,1
3901,127500.0,6,13,11
446,115000.0,3,4,0
1531,135000.0,5,7,3
6709,55000.0,8,25,14
5524,160000.0,6,20,10
8011,240000.0,5,43,39
6365,130000.0,5,24,15
3324,121000.0,5,11,3
6474,195000.0,5,25,20


In [7]:
# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

In [8]:
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

In [9]:

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
# Fitting the model to the data
model = RandomForestRegressor()
model.fit(X_train, y_train);
model.score(X_test, y_test)

0.062172198735375606

In [11]:
app_test = model.predict([1,1,1])

ValueError: Expected 2D array, got 1D array instead:
array=[1. 1. 1.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [90]:
# Making a prediction
# y_label = regr.predict(np.array([0, 2, 3, 4]))

In [91]:
y_preds = model.predict(X_test)
y_preds

array([112207.76161353, 149540.35714286, 231583.01587302, ...,
       109242.05586914,  87468.51      ,  99620.80919972])

In [92]:
y_test

2684    150000.0
6383    105000.0
5891    180000.0
6358    140000.0
5374     72000.0
          ...   
88       85000.0
734     120000.0
196     150000.0
7652     70000.0
3002     54000.0
Name: ConvertedComp, Length: 1670, dtype: float64

In [93]:
# Evaluating this model on the training data and the test data
model.score(X_train, y_train)

0.47911317591040226

In [94]:
model.score(X_test, y_test)

0.062172198735375606

In [95]:
np.random.seed(42)
for i in range(10, 200, 10):
    print(f'Trying model with {i} estimators...')
    model = RandomForestRegressor(n_estimators=i).fit(X_train, y_train)
    print(f'Model accuracy on test set: {model.score(X_test, y_test) * 100}%')
    print(' ')

Trying model with 10 estimators...
Model accuracy on test set: 1.6084603066399583%
 
Trying model with 20 estimators...
Model accuracy on test set: 5.418700186716374%
 
Trying model with 30 estimators...
Model accuracy on test set: 4.175144600452652%
 
Trying model with 40 estimators...
Model accuracy on test set: 6.4760175070528785%
 
Trying model with 50 estimators...
Model accuracy on test set: 6.453339986167739%
 
Trying model with 60 estimators...
Model accuracy on test set: 5.58218181541863%
 
Trying model with 70 estimators...
Model accuracy on test set: 5.850301999750885%
 
Trying model with 80 estimators...
Model accuracy on test set: 5.595028883899744%
 
Trying model with 90 estimators...
Model accuracy on test set: 6.306602306365006%
 
Trying model with 100 estimators...
Model accuracy on test set: 7.112772327980165%
 
Trying model with 110 estimators...
Model accuracy on test set: 6.335937201862507%
 
Trying model with 120 estimators...
Model accuracy on test set: 6.1790571

In [96]:
import pickle

pickle.dump(model, open('test_model.pkl', 'wb'))

In [97]:
loaded_model = pickle.load(open('test_model.pkl', 'rb'))
loaded_model.score(X_test, y_test)

0.06508475497852395

# divider

In [98]:
# ml map shows to try lasso or elasticnet if few features are important
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.1)
reg.fit(X_train, y_train);

In [99]:
reg.score(X_test, y_test) * 100

17.644192792381872

In [100]:
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

np.random.seed(42)

model2 = ElasticNet(random_state=0)
model2.fit(X_train, y_train)

model2.score(X_test, y_test)

0.1753329068119207

### ml map shows to try ridgeregression or SVR kernel linear if few features are important

In [101]:
# RidgeRegression
from sklearn.linear_model import Ridge

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Ridge
regr3 = Ridge()
regr3.fit(X_train, y_train)

regr3.score(X_test, y_test)

0.1764423461756297

In [102]:
# SVR kernel='rbf'
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svr_linear = svm.SVR(kernel='linear')
svr_linear.fit(X_train, y_train)

svr_linear.score(X_test, y_test)

0.14014883948079182

In [103]:
# ml map shows to try SVR kernel rbf or ensembleregressors if nothing else worked

In [104]:
# SVR kernel='rbf'
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svr_rbf = svm.SVR(kernel='rbf')
svr_rbf.fit(X_train, y_train)

svr_rbf.score(X_test, y_test)

-0.018659878833494847

In [105]:
# EnsembleRegressors
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

rf.score(X_test, y_test)

0.062172198735375606

In [106]:
from sklearn.linear_model import LinearRegression

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
lreg = LinearRegression()
lreg.fit(X_train, y_train)

lreg.score(X_test, y_test)

0.176441938103564

In [107]:
# trying svc linear classification
from sklearn import svm

np.random.seed(42)

# Creating the features matrix
X = data.drop('ConvertedComp', axis=1)

# Creating the labels
y = data['ConvertedComp']

# splitting into training and testing data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate Random Forest Regressor
svcc = svm.SVC()
svcc.fit(X_train, y_train)

svcc.score(X_test, y_test)

0.04251497005988024

In [108]:
svcc.predict([[4, 10, 10]])

array([120000.])

In [109]:
svcc.predict([[1, 3, 3]])

array([75000.])

# divider