In [2]:
import sys
import sklearn
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import csv

# Data Cleaning

In [3]:
# Helper functions
def convert_si_to_number(x):
    total_stars = 0
    if 'k' in x:
        if len(x) > 1:
            total_stars = float(x.replace('k', '')) * 1000 # convert k to a thousand
    elif 'M' in x:
        if len(x) > 1:
            total_stars = float(x.replace('M', '')) * 1000000 # convert M to a million
    elif 'B' in x:
        total_stars = float(x.replace('B', '')) * 1000000000 # convert B to a Billion
    else:
        total_stars = int(x) # Less than 1000
    
    return int(total_stars)

In [4]:
# Read file
file = open("../data/merged.csv")
reader = csv.reader(file)
i = 0
y = []
X = []
names = []
next(reader)
for row in reader:
    name = row[1]
    industry = row[4]
    size_range = row[5]
    location = row[6].split(",")
    city = location[0].strip()
    state = location[1].strip()
    currentEmployeeEstimate = row[9]
    totalEmployeeEstimate = row[10]
    score = row[11]
    reviews = row[12]
    salaries = row[13]
    interviews = row[14]
    reviews = 0 if reviews == "--" else reviews
    salaries = 0 if salaries == "--" else salaries
    interviews = 0 if interviews == "--" else interviews
    
    i = i + 1
    
    # Add names
    names.append(name)
    
    # Add y labels
    y.append(score)
    
    # Add x features
    X.append([industry, size_range, city, state, currentEmployeeEstimate, totalEmployeeEstimate, reviews, salaries, interviews])

In [5]:
X[0]

['retail',
 '10001+',
 'withee',
 'wisconsin',
 '120753',
 '272827',
 '48000.0',
 '56000.0',
 '9000.0']

In [6]:
# Convert to pandas dataframe
X = np.array(X)
y = np.array(y)
X = pd.DataFrame({'Industry': X[:, 0], 'Size_Range': X[:, 1], 'City': X[:, 2], 'State': X[:, 3], 'CurrentEmployeeEstimate': X[:, 4], 'TotalEmployeeEstimate': X[:, 5], 'Reviews': X[:, 6], 'Salaries': X[:, 7], 'Interviews': X[:, 8]})

In [7]:
X

Unnamed: 0,City,CurrentEmployeeEstimate,Industry,Interviews,Reviews,Salaries,Size_Range,State,TotalEmployeeEstimate
0,withee,120753,retail,9000.0,48000.0,56000.0,10001+,wisconsin,272827
1,minneapolis,60602,retail,11000.0,34000.0,44000.0,10001+,minnesota,204360
2,seattle,93247,internet,21000.0,37000.0,47000.0,10001+,washington,161866
3,alexandria,162163,military,1800.0,22000.0,19000.0,10001+,virginia,445958
4,dallas,115188,telecommunications,3900.0,23000.0,29000.0,10001+,texas,269659
5,atlanta,44630,retail,4800.0,22000.0,27000.0,10001+,georgia,114799
6,seattle,46523,retail,5300.0,24000.0,29000.0,10001+,washington,147107
7,charlotte,97357,banking,5700.0,22000.0,35000.0,10001+,north carolina,279769
8,new york,274047,information technology and services,9000.0,51000.0,76000.0,10001+,new york,716906
9,basking ridge,37368,information technology and services,4100.0,21000.0,26000.0,10001+,new jersey,85643


In [8]:
# Label Encoder
le = LabelEncoder()
categorical_feature_mask = X.dtypes==object
categorical_cols = X.columns[[0, 2, 6, 7]].tolist()
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

In [18]:
X

Unnamed: 0,City,CurrentEmployeeEstimate,Industry,Interviews,Reviews,Salaries,Size_Range,State,TotalEmployeeEstimate
0,4263,120753,124,9000.0,48000.0,56000.0,1,607,272827
1,2461,60602,124,11000.0,34000.0,44000.0,1,351,204360
2,3483,93247,67,21000.0,37000.0,47000.0,1,588,161866
3,57,162163,88,1800.0,22000.0,19000.0,1,575,445958
4,944,115188,132,3900.0,23000.0,29000.0,1,533,269659
5,171,44630,124,4800.0,22000.0,27000.0,1,197,114799
6,3483,46523,124,5300.0,24000.0,29000.0,1,588,147107
7,710,97357,10,5700.0,22000.0,35000.0,1,388,279769
8,2665,274047,63,9000.0,51000.0,76000.0,1,375,716906
9,256,37368,63,4100.0,21000.0,26000.0,1,372,85643


In [19]:
# One hot encoding
# enc = preprocessing.OneHotEncoder()
# enc.fit(X_2)
# X_3 = enc.transform(X_2).toarray()
# X_3.shape
ohe = OneHotEncoder(categorical_features=[0, 2, 6, 7], sparse=True)
X_2 = ohe.fit_transform(X)

In [20]:
# Convert to floats
X_2 = X_2.astype(np.float)
y = y.astype(np.float)

In [86]:
len(X_2[0])

TypeError: 'coo_matrix' object does not support indexing

In [87]:
print(X_2)

  (31721, 0)	1.0
  (18684, 1)	1.0
  (10417, 2)	1.0
  (34420, 3)	1.0
  (3165, 4)	1.0
  (7956, 5)	1.0
  (8052, 6)	1.0
  (1144, 7)	1.0
  (6883, 7)	1.0
  (15521, 7)	1.0
  (17789, 7)	1.0
  (18296, 7)	1.0
  (18581, 7)	1.0
  (22715, 7)	1.0
  (23166, 7)	1.0
  (25125, 7)	1.0
  (25147, 7)	1.0
  (26573, 7)	1.0
  (30216, 7)	1.0
  (31763, 7)	1.0
  (32313, 7)	1.0
  (32431, 7)	1.0
  (34055, 7)	1.0
  (17635, 8)	1.0
  (5410, 9)	1.0
  :	:
  (34733, 5145)	194.0
  (34734, 5145)	35.0
  (34735, 5145)	30.0
  (34736, 5145)	76.0
  (34737, 5145)	358.0
  (34738, 5145)	584.0
  (34739, 5145)	288.0
  (34740, 5145)	467.0
  (34741, 5145)	477.0
  (34742, 5145)	453.0
  (34743, 5145)	638.0
  (34744, 5145)	348.0
  (34745, 5145)	621.0
  (34746, 5145)	221.0
  (34747, 5145)	711.0
  (34748, 5145)	322.0
  (34749, 5145)	704.0
  (34750, 5145)	80.0
  (34751, 5145)	296.0
  (34752, 5145)	120.0
  (34753, 5145)	100.0
  (34754, 5145)	74.0
  (34755, 5145)	99.0
  (34756, 5145)	126.0
  (34757, 5145)	16.0


# Modeling

In [101]:
# WITH 1 FEATURE
X_Experimental = X_2.toarray()[:,-1:]
x_train, x_test, y_train, y_test = train_test_split(X_Experimental, y, test_size=0.2)
model = LinearRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
error = mean_squared_error(y_test, predictions)
error

0.4631477141289966

In [89]:
model = LinearRegression()
scores = cross_val_score(model, X_Experimental, y, cv=5, scoring = "neg_mean_squared_error") # also use R^2
scores

array([-0.32300821, -0.41824997, -0.46637291, -0.53849636, -0.59160542])

In [99]:
# WITH ALL FEATURES WITHOUT ONE HOT ENCODING
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = LinearRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
error = mean_squared_error(y_test, predictions)
error

0.46484143154049606

In [100]:
model = LinearRegression()
scores = cross_val_score(model, X, y, cv=5, scoring = "neg_mean_squared_error") # also use R^2
scores

array([-0.87793422, -0.41748855, -0.46550401, -0.53786223, -0.59184744])

In [103]:
# WITH ALL FEATURES WITH ONE HOT ENCODING
# Train, test, split, calculate error
x_train, x_test, y_train, y_test = train_test_split(X_2, y, test_size=0.2)
model = LinearRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
error = mean_squared_error(y_test, predictions)
error

0.4812937520444667

In [96]:
model = LinearRegression()
scores = cross_val_score(model, X_2, y, cv=5, scoring = "neg_mean_squared_error") # also use R^2
scores

array([-0.69819627, -0.43945324, -0.47608438, -0.54572644, -0.61053225])

In [104]:
# WITH ALL FEATURES WITH ONE HOT ENCODING AND REGULARIZATION
# Train, test, split, calculate error
from sklearn import linear_model
x_train, x_test, y_train, y_test = train_test_split(X_2, y, test_size=0.2)
model = linear_model.Lasso(alpha=0.1)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
error = mean_squared_error(y_test, predictions)
error

0.4713500866723761

In [105]:
from sklearn import linear_model
model = linear_model.Lasso(alpha=0.1)
scores = cross_val_score(model, X_2, y, cv=5, scoring = "neg_mean_squared_error") # also use R^2
scores

array([-0.68552913, -0.41783322, -0.46615665, -0.53830016, -0.59139369])

In [34]:
# Train model on training data
model = LinearRegression()
model.fit(X_2, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [35]:
# Make predictions
predictions = model.predict(X_2)

In [36]:
# Calculate statistics on training data
error = mean_squared_error(y, predictions)
w = model.coef_
b = model.intercept_
error, w, b

(0.3732065724339587,
 array([-9.15496863e-02, -3.39691589e-01, -8.18432723e-02, ...,
        -2.53025989e-05, -2.23592589e-05, -1.37581767e-06]),
 3.4842280913746477)

# Results

Features without location: 
error, w, b is (0.45079620978786505,
 array([ 0.00647003, -0.00115187, -0.00028681]),
 3.3909317414654145)

5-fold cross validation returns
array([-0.44058222, -0.46424376, -0.44806606, -0.44477971, -0.45780714])

 
All features: 
error, w, b is (0.3475201663575248,
 array([-8.63861463e-01, -1.59436776e-01,  7.42368542e-01, ...,
         6.88801126e-03, -1.46312267e-03, -2.93404668e-04]),
 3.3865073437888973)

5-fold cross validation returns
array([-0.49178862, -0.53366115, -0.52459808, -0.52376639, -0.53658507])