## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

## Load Dataset

In [2]:
# Load the revenue dataset into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
data = pd.read_csv('data/revenue-data.csv')

In [3]:
# Preview the first rows to check the columns and value types that were loaded.
data.head()

Unnamed: 0,Age,SiteSpending,SiteTime,RecommendImpression,Education,WorkType,Sex,Region,Salary
0,32,314.06,30.14,0,Degree,Private sector,Male,London,60173.49
1,20,3758.36,149.36,4,GCSE,Private sector,Female,South East,42965.45
2,36,601.72,21.87,0,Masters,Private sector,Male,East of England,54924.41
3,21,44.89,182.8,9,Masters,Private sector,Female,Northern Ireland,26734.99
4,24,614.8,194.34,0,GCSE,Private sector,Male,Scotland,15325.23


## Data Processing

### Label Encoding

In [4]:
from sklearn import preprocessing

# Text columns that are replaced by integer codes below.
categorical_columns = ["Education", "WorkType", "Sex", "Region"]

# Fit a separate LabelEncoder per column and keep every fitted encoder, so each
# code -> category mapping stays available afterwards through `classes_` /
# `inverse_transform`. A single shared encoder is refit on every call and would
# only remember the last column it saw.
#
# NOTE(review): LabelEncoder numbers the categories in sorted order, so the
# codes carry no real ranking (in the preview below Degree=1, GCSE=2,
# Masters=3). Models that treat the codes as magnitudes may be misled; consider
# one-hot encoding for nominal columns such as Region.
#
# This cell overwrites the text columns of `data` in place. Re-running it would
# refit the encoders on the integer codes and lose the original category names.
label_encoders = {}
for column in categorical_columns:
    le = preprocessing.LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

data.head()

Unnamed: 0,Age,SiteSpending,SiteTime,RecommendImpression,Education,WorkType,Sex,Region,Salary
0,32,314.06,30.14,0,1,0,1,2,60173.49
1,20,3758.36,149.36,4,2,0,0,7,42965.45
2,36,601.72,21.87,0,3,0,1,1,54924.41
3,21,44.89,182.8,9,3,0,0,5,26734.99
4,24,614.8,194.34,0,2,0,1,6,15325.23


### Feature Scaling

In [5]:
from sklearn import preprocessing

# Rescale every column (the predictors and the Salary target alike) to the
# [0, 1] range with min-max scaling.
#
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split that happens further down, so the minima/maxima of the test rows leak
# into the training features. Because Salary is scaled as well, the error
# metrics reported later are in scaled units, not in the original Salary
# units. Fitting the scaler on the training rows only (after the split) would
# remove the leakage.
x = data.values  # returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
data_scaled = min_max_scaler.fit_transform(x)

# fit_transform returns a bare ndarray; attach the column labels right here so
# `df` is usable as soon as this cell has run.
df = pd.DataFrame(data_scaled, columns=data.columns)

In [6]:
# Carry the column labels of `data` over to the scaled frame (the scaler
# output is an unlabeled array).
df.columns = data.columns

In [7]:
# Preview the scaled data.
df.head()

Unnamed: 0,Age,SiteSpending,SiteTime,RecommendImpression,Education,WorkType,Sex,Region,Salary
0,0.206349,0.062878,0.084627,0.0,0.166667,0.0,1.0,0.181818,0.35947
1,0.015873,0.854345,0.48771,0.235294,0.333333,0.0,0.0,0.636364,0.229875
2,0.269841,0.128979,0.056666,0.0,0.5,0.0,1.0,0.090909,0.319939
3,0.031746,0.001025,0.600771,0.529412,0.5,0.0,0.0,0.454545,0.107643
4,0.079365,0.131985,0.639788,0.0,0.333333,0.0,1.0,0.545455,0.021716


## Support Vector Regressor

In [8]:
# Separate the (scaled) Salary target from the remaining predictor columns.
y = df['Salary']
X = df.drop('Salary', axis=1)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [10]:
from sklearn.svm import SVR
# Support vector regressor with an RBF kernel; every other hyperparameter is
# left at its library default.
regressor = SVR(kernel = 'rbf')
# Fit on the training split. As the last expression of the cell, the fitted
# estimator is also what the cell displays.
regressor.fit(X_train, y_train)

SVR()

In [11]:
# SVR predictions for the held-out test rows (in scaled Salary units).
y_pred = regressor.predict(X_test)

In [12]:
# For regressors, score() returns the coefficient of determination (R^2),
# here evaluated on the test split.
test_score = regressor.score(X_test, y_test)

In [13]:
# Display the SVR test-set R^2.
test_score

0.6438353477663408

### Evaluation Metrics

In [24]:
def adjusted_r2_score(actual, predictions, num_pred, num_samples):
    """Return the adjusted R^2 of ``predictions`` against ``actual``.

    Adjusted R^2 penalises the plain R^2 for the number of predictors:

        1 - (1 - R^2) * (n - 1) / (n - k - 1)

    Parameters
    ----------
    actual : array-like
        Observed target values.
    predictions : array-like
        Model predictions, aligned with ``actual``.
    num_pred : int
        Number of predictors (k) used by the model.
    num_samples : int
        Number of samples (n) the score is computed over.

    Returns
    -------
    float
        The adjusted R^2 score.

    Raises
    ------
    ValueError
        If ``num_samples - num_pred - 1`` is not positive: the correction
        factor would then divide by zero or flip sign, so the result would be
        undefined or meaningless.
    """
    n = num_samples
    k = num_pred

    # Validate before doing any work: the correction needs n > k + 1.
    residual_dof = n - k - 1
    if residual_dof <= 0:
        raise ValueError(
            "adjusted R^2 requires num_samples > num_pred + 1 "
            f"(got num_samples={n}, num_pred={k})"
        )

    # Imported here so the function does not depend on a later cell having
    # brought r2_score into the notebook namespace.
    from sklearn.metrics import r2_score

    r2 = r2_score(actual, predictions)
    adjusted_r2 = 1 - ((1 - r2) * ((n - 1) / residual_dof))
    return adjusted_r2

In [25]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Test-set metrics for the current `y_pred` (the SVR predictions), printed in
# this order: R^2, MAE, MSE, RMSE.
mse = mean_squared_error(y_test, y_pred)
print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(mse)
print(np.sqrt(mse))

# Adjusted R^2 needs the test-set dimensions: rows are samples, columns are
# predictors.
num_samples, num_predictors = X_test.shape
adjusted_r2_score(y_test, y_pred, num_predictors, num_samples)

0.6438353477663408
0.08242692270783723
0.009853586395787635
0.09926523256300583


0.6340438796636972

## Decision Tree Regressor

In [26]:
from sklearn import tree

# Decision trees pick between equally good candidate splits at random, so the
# fitted tree (and every metric below) can change from run to run. Pin the
# seed, using the same value as the train/test split, to keep the results
# reproducible.
reg = tree.DecisionTreeRegressor(random_state=42)
reg.fit(X_train, y_train)

DecisionTreeRegressor()

In [27]:
# Decision-tree predictions for the test rows. This rebinds `y_pred`, so the
# SVR predictions computed earlier are no longer available under that name.
y_pred = reg.predict(X_test)

### Evaluation Metrics

In [31]:
from sklearn.metrics import r2_score

# Test-set metrics for the current `y_pred` (the decision-tree predictions),
# printed in this order: R^2, MAE, MSE, RMSE.
mse = mean_squared_error(y_test, y_pred)
print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(mse)
print(np.sqrt(mse))

# Adjusted R^2 needs the test-set dimensions: rows are samples, columns are
# predictors.
num_samples, num_predictors = X_test.shape
adjusted_r2_score(y_test, y_pred, num_predictors, num_samples)

0.7276209778325664
0.06022726648565096
0.00753558841534393
0.08680776702198904


0.7201328947489256

## Random Forest Regressor

In [32]:
from sklearn.ensemble import RandomForestRegressor

# Each tree in the forest is grown on a random bootstrap sample, so an unseeded
# forest gives different metrics on every run. Pin the seed, using the same
# value as the train/test split, to keep the results reproducible.
rf = RandomForestRegressor(random_state=42)

rf.fit(X_train, y_train)

RandomForestRegressor()

In [33]:
# Sanity-check the split sizes: the two feature matrices first, then the two
# target vectors.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(700, 8)
(300, 8)
(700,)
(300,)


In [34]:
# Random-forest predictions for the test rows. This rebinds `y_pred`, replacing
# the decision-tree predictions stored under the same name.
y_pred = rf.predict(X_test)

### Evaluation Metrics

In [36]:
from sklearn.metrics import r2_score

# Test-set metrics for the current `y_pred` (the random-forest predictions),
# printed in this order: R^2, MAE, MSE, RMSE.
mse = mean_squared_error(y_test, y_pred)
print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
print(mse)
print(np.sqrt(mse))

# Adjusted R^2 needs the test-set dimensions: rows are samples, columns are
# predictors.
num_samples, num_predictors = X_test.shape
adjusted_r2_score(y_test, y_pred, num_predictors, num_samples)

0.8977406847696814
0.03840539031818529
0.0028290875893405175
0.05318916796999665


0.8949294321173015