In [103]:
import sys
import sklearn
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import csv

# Data Cleaning

In [104]:
# Helper functions
def convert_si_to_number(x):
    """Convert a count string with an optional k/M/B suffix to an integer.

    Examples: ``'12'`` -> 12, ``'48000.0'`` -> 48000, ``'1.5k'`` -> 1500,
    ``'2M'`` -> 2000000, ``'3B'`` -> 3000000000.

    Args:
        x: String holding a number, optionally containing one of the
            case-sensitive suffixes ``k`` (thousand), ``M`` (million) or
            ``B`` (billion).

    Returns:
        The value as an ``int`` (any fractional remainder is truncated). A
        string consisting only of a suffix, e.g. ``'k'``, yields 0.

    Raises:
        ValueError: If the numeric part cannot be parsed.
    """
    # Checked in this order so that the first matching suffix wins.
    multipliers = (('k', 1000), ('M', 1000000), ('B', 1000000000))
    for suffix, multiplier in multipliers:
        if suffix in x:
            digits = x.replace(suffix, '').strip()
            if not digits:
                # A bare suffix carries no magnitude; treat it as zero for
                # every suffix (a bare 'B' used to raise ValueError while a
                # bare 'k' or 'M' returned 0).
                return 0
            return int(float(digits) * multiplier)

    try:
        return int(x)  # plain integers keep full precision
    except ValueError:
        # Decimal strings such as '48000.0' are rejected by int().
        return int(float(x))

In [105]:
# Read file
# Parse ../data/merged.csv into three parallel lists: company names, raw
# feature rows (X) and target scores (y). All values are kept as read from the
# CSV (strings), except that "--" placeholders become the integer 0.
i = 0  # number of data rows read
y = []
X = []
names = []
# The context manager closes the file even if a row fails to parse;
# newline="" is what the csv module expects for files it reads.
with open("../data/merged.csv", newline="") as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row (tolerates an empty file)
    for i, row in enumerate(reader, start=1):
        name = row[1]
        industry = row[4]
        size_range = row[5]

        # Location is "city, state[, ...]"; a value without a comma used to
        # raise IndexError, now it simply yields an empty state.
        location = [part.strip() for part in row[6].split(",")]
        city = location[0]
        state = location[1] if len(location) > 1 else ""

        currentEmployeeEstimate = row[9]
        totalEmployeeEstimate = row[10]
        score = row[11]

        # "--" marks a missing count in these three columns; treat it as zero.
        reviews, salaries, interviews = (
            0 if value == "--" else value for value in row[12:15]
        )

        # Add names
        names.append(name)

        # Add y labels
        y.append(score)

        # Add x features
        X.append([industry, size_range, city, state, currentEmployeeEstimate, totalEmployeeEstimate, reviews, salaries, interviews])

In [106]:
# Show the first parsed feature row as a sanity check on the CSV parsing.
X[0]

['retail',
 '10001+',
 'withee',
 'wisconsin',
 '120753',
 '272827',
 '48000.0',
 '56000.0',
 '9000.0']

In [107]:
# Convert to pandas dataframe
# Column labels in the order the values were appended to each feature row.
feature_names = ['Industry', 'Size_Range', 'City', 'State', 'CurrentEmployeeEstimate', 'TotalEmployeeEstimate', 'Reviews', 'Salaries', 'Interviews']
if not isinstance(X, pd.DataFrame):
    # Guard keeps the cell safe to re-run: once X is a labelled frame it must
    # not be re-labelled by position.
    # np.array() on the mixed str/int rows yields an all-string array, so every
    # column starts out as strings (numeric conversion happens later).
    X = pd.DataFrame(np.array(X), columns=feature_names)
# Pin the columns to alphabetical order. Older pandas sorted dict keys this
# way when building a frame, newer pandas keeps insertion order; the positional
# indices [0, 2, 6, 7] used by the encoding cells only address the categorical
# columns (City, Industry, Size_Range, State) under alphabetical order.
X = X[sorted(feature_names)]
y = np.array(y)

In [108]:
# (rows, columns) of the feature frame.
X.shape

(34758, 9)

In [109]:
# Label Encoder
# Select the categorical columns by name rather than by position: the position
# of each column depends on how the DataFrame was built (older pandas sorted
# dict keys alphabetically, newer pandas keeps insertion order), so fixed
# indices can silently pick numeric count columns and leave string columns
# unencoded.
categorical_cols = ['Industry', 'Size_Range', 'City', 'State']
# One encoder per column, so every label mapping stays available afterwards
# (a single shared encoder only remembers the last column it was fitted on).
label_encoders = {col: LabelEncoder() for col in categorical_cols}
X[categorical_cols] = X[categorical_cols].apply(lambda col: label_encoders[col.name].fit_transform(col))

In [110]:
# One hot encoding
# Resolve the categorical columns by name so the encoder does not depend on
# the DataFrame's column order.
# NOTE: `categorical_features` was deprecated in scikit-learn 0.20 and removed
# in 0.22; on newer versions wrap OneHotEncoder in a ColumnTransformer instead.
categorical_idx = sorted(X.columns.get_loc(col) for col in ['Industry', 'Size_Range', 'City', 'State'])
ohe = OneHotEncoder(categorical_features=categorical_idx)
X_2 = ohe.fit_transform(X)

In [111]:
# Convert to floats
# np.float was only an alias for the builtin float (float64); the alias was
# deprecated in NumPy 1.20 and removed in 1.24, so name the dtype explicitly.
X_2 = X_2.astype(np.float64)
y = y.astype(np.float64)

# Modeling

In [112]:
# WITH 1 FEATURE
# Baseline: fit on only the last column of the one-hot-encoded matrix.
X_Experimental = X_2.toarray()[:,-1:]
# Fixed seed so the 80/20 split, and therefore the reported MSE, is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X_Experimental, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
error = mean_squared_error(y_test, predictions)  # MSE on the held-out 20%
error

0.46818095475936455

In [113]:
# 5-fold cross-validation of a fresh linear model on the single-feature matrix.
# The "neg_mean_squared_error" scorer reports MSE with its sign flipped, so the
# scores are negative and values closer to 0 are better.
model = LinearRegression()
scores = cross_val_score(model, X_Experimental, y, cv=5, scoring = "neg_mean_squared_error") # TODO: also report R^2
scores

array([-0.32300821, -0.41824997, -0.46637291, -0.53849636, -0.59160542])

In [114]:
# WITH ALL FEATURES WITHOUT ONE HOT ENCODING
# Uses the label-encoded frame X directly, i.e. categorical codes are treated
# as ordinary numeric inputs.
# Fixed seed so the 80/20 split, and therefore the reported MSE, is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
error = mean_squared_error(y_test, predictions)  # MSE on the held-out 20%
error

0.47111310002495976

In [115]:
# 5-fold cross-validation of a fresh linear model on the label-encoded frame X.
# The "neg_mean_squared_error" scorer reports MSE with its sign flipped, so the
# scores are negative and values closer to 0 are better.
model = LinearRegression()
scores = cross_val_score(model, X, y, cv=5, scoring = "neg_mean_squared_error") # TODO: also report R^2
scores

array([-0.87793422, -0.41748855, -0.46550401, -0.53786223, -0.59184744])

In [117]:
# WITH ALL FEATURES WITH ONE HOT ENCODING
# Train, test, split, calculate error
# Fixed seed so the 80/20 split, and therefore the reported MSE, is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X_2, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
error = mean_squared_error(y_test, predictions)  # MSE on the held-out 20%
error

0.48165572102454673

In [118]:
# 5-fold cross-validation of a fresh linear model on the one-hot-encoded matrix X_2.
# The "neg_mean_squared_error" scorer reports MSE with its sign flipped, so the
# scores are negative and values closer to 0 are better.
model = LinearRegression()
scores = cross_val_score(model, X_2, y, cv=5, scoring = "neg_mean_squared_error") # TODO: also report R^2
scores

array([-0.69819627, -0.43945324, -0.47608438, -0.54572644, -0.61053225])

In [119]:
# Fit the final model on the full one-hot-encoded dataset (X_2, y); no rows are
# held out here, so anything computed from this fit is a training-set figure.
model = LinearRegression()
model.fit(X_2, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [120]:
# Make predictions on the same rows the model was fitted on (X_2).
predictions = model.predict(X_2)

In [121]:
# Calculate statistics on training data
error = mean_squared_error(y, predictions)  # MSE of predictions against y
w = model.coef_  # fitted coefficients, one per column of X_2
b = model.intercept_  # fitted intercept
# Display all three as a tuple.
error, w, b

(0.3732065724339587,
 array([-9.15496863e-02, -3.39691589e-01, -8.18432723e-02, ...,
        -2.53025989e-05, -2.23592589e-05, -1.37581767e-06]),
 3.4842280913746477)

# Results

Features without location (training MSE, weights, intercept):
(error, w, b) = (0.45079620978786505,
 array([ 0.00647003, -0.00115187, -0.00028681]),
 3.3909317414654145)

5-fold cross-validation returns (negated MSE per fold; closer to 0 is better):
array([-0.44058222, -0.46424376, -0.44806606, -0.44477971, -0.45780714])

 
All features (training MSE, weights, intercept):
(error, w, b) = (0.3475201663575248,
 array([-8.63861463e-01, -1.59436776e-01,  7.42368542e-01, ...,
         6.88801126e-03, -1.46312267e-03, -2.93404668e-04]),
 3.3865073437888973)

5-fold cross-validation returns (negated MSE per fold; closer to 0 is better):
array([-0.49178862, -0.53366115, -0.52459808, -0.52376639, -0.53658507])