In [35]:
import csv
import sys

import numpy as np
import pandas as pd
import sklearn
from scipy import sparse
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Data Cleaning

In [8]:
# Helper functions
def convert_si_to_number(x):
    """Convert a count written with an SI-style suffix into an integer.

    The suffixes ``k`` (thousand), ``M`` (million) and ``B`` (billion) are
    checked in that order and matched anywhere in the text. A value without
    any of these suffixes is parsed as a plain number.

    Args:
        x: Text such as ``"1.5k"``, ``"2M"``, ``"3B"``, ``"999"`` or
            ``"23000.0"``. Non-string input is converted with ``str()``.

    Returns:
        int: The expanded value. A suffix with no digits (e.g. ``"k"`` or
        ``"B"``) yields 0.

    Raises:
        ValueError: If the numeric part cannot be parsed as a number.
        OverflowError: If the text parses to an infinite float.
    """
    multipliers = (('k', 1000), ('M', 1000000), ('B', 1000000000))
    text = str(x).strip()

    for suffix, factor in multipliers:
        if suffix in text:
            digits = text.replace(suffix, '').strip()
            if not digits:
                # Bare suffix: nothing to scale. Handled the same way for all
                # three suffixes instead of raising only for "B".
                return 0
            # round() rather than truncation: binary float error can leave the
            # product just below the intended integer (1.005 * 1000 < 1005).
            return int(round(float(digits) * factor))

    try:
        # Exact path for plain integers (no float precision loss).
        return int(text)
    except ValueError:
        # Accept decimal strings such as "23000.0"; still raises ValueError
        # when the text is not numeric at all.
        return int(float(text))

In [49]:
# Read file
# Builds three parallel lists from ../data/merged.csv:
#   names - company name (column 1)
#   y     - score (column 11), the regression target
#   X     - one list of nine raw feature values per row
i = 0
y = []
X = []
names = []
# The context manager closes the file even if a row fails to parse.
with open("../data/merged.csv") as file:
    reader = csv.reader(file)
    next(reader)  # skip the first line (presumably the header row)
    for row in reader:
        name = row[1]
        industry = row[4]
        size_range = row[5]
        location = row[6].split(",")
        city = location[0].strip()
        # A location without a comma has no second part; use an empty state
        # instead of failing with IndexError.
        state = location[1].strip() if len(location) > 1 else ""
        currentEmployeeEstimate = row[9]
        totalEmployeeEstimate = row[10]
        score = row[11]
        reviews = row[12]
        salaries = row[13]
        interviews = row[14]
        # "--" marks a missing count; treat it as zero.
        reviews = 0 if reviews == "--" else reviews
        salaries = 0 if salaries == "--" else salaries
        interviews = 0 if interviews == "--" else interviews

        i = i + 1  # running count of data rows read

        # Add names
        names.append(name)

        # Add y labels
        y.append(score)

        # Add x features
        X.append([industry, size_range, city, state, currentEmployeeEstimate, totalEmployeeEstimate, reviews, salaries, interviews])

In [50]:
# Sanity check: display the first feature row collected while reading the CSV
X[0]

['telecommunications',
 '10001+',
 'dallas',
 'texas',
 '115188',
 '269659',
 '23000.0',
 '29000.0',
 '3900.0']

In [51]:
# Convert to pandas dataframe
# np.array() stores every cell as text (including the integer 0 substituted for
# "--"), so all columns start out as strings.
feature_names = ['Industry', 'Size_Range', 'City', 'State', 'CurrentEmployeeEstimate',
                 'TotalEmployeeEstimate', 'Reviews', 'Salaries', 'Interviews']
X = pd.DataFrame(np.array(X), columns=feature_names)
# Pin the alphabetical column order shown in this notebook's rendered output.
# Building the frame from a dict left the order to the pandas/Python version,
# while later cells select columns by position ([0, 2, 6, 7]).
X = X[sorted(feature_names)]
y = np.array(y)

In [52]:
# A bare `X.shape` that is not the last expression of a cell is never shown,
# so print it explicitly; preview the first rows instead of the whole frame.
print(X.shape)
X.head(10)

Unnamed: 0,City,CurrentEmployeeEstimate,Industry,Interviews,Reviews,Salaries,Size_Range,State,TotalEmployeeEstimate
0,dallas,115188,telecommunications,3900.0,23000.0,29000.0,10001+,texas,269659
1,atlanta,44630,retail,4800.0,22000.0,27000.0,10001+,georgia,114799
2,richfield,29568,retail,4800.0,20000.0,27000.0,10001+,minnesota,120812
3,woonsocket,33408,pharmaceuticals,3200.0,16000.0,23000.0,10001+,rhode island,70569
4,hopewell,1,semiconductors,3000.0,15000.0,21000.0,1 - 10,virginia,2
5,ayot saint lawrence,0,information technology and services,12000.0,34000.0,78000.0,1 - 10,hertford,1
6,edinburgh,1,information technology and services,2700.0,13000.0,14000.0,1 - 10,edinburgh,2
7,seattle,20376,retail,3600.0,12000.0,11000.0,10001+,washington,81777
8,lakewood,1,graphic design,2900.0,12000.0,17000.0,1 - 10,ohio,3
9,cincinnati,22257,retail,2000.0,9500.0,13000.0,10001+,ohio,52584


In [53]:
# Label Encoder
# Replace each categorical text column with integer codes.
le = LabelEncoder()
# NOTE(review): X appears to hold only string (object) columns at this point, so
# this mask cannot separate categorical from numeric columns - confirm. It is
# kept only so the name stays defined; it is not used for the selection below.
categorical_feature_mask = X.dtypes==object
# Select by name rather than by position: positions [0, 2, 6, 7] only match
# these four columns when the frame happens to be in alphabetical column order.
categorical_cols = ['City', 'Industry', 'Size_Range', 'State']
# fit_transform refits the encoder for every column, so afterwards `le` holds
# the classes of the last column only.
X[categorical_cols] = X[categorical_cols].apply(le.fit_transform)

In [54]:
# One hot encoding
# OneHotEncoder's `categorical_features` argument was deprecated in scikit-learn
# 0.20 and removed in 0.22, so encode the categorical columns on their own and
# stack the remaining columns back on by hand.
onehot_cols = ['City', 'Industry', 'Size_Range', 'State']
numeric_cols = [col for col in X.columns if col not in onehot_cols]

ohe = OneHotEncoder()
X_onehot = ohe.fit_transform(X[onehot_cols])

# The remaining columns still hold numbers as text; convert them to floats.
X_numeric = sparse.csr_matrix(X[numeric_cols].astype(float).values)

# Encoded columns first, untouched columns stacked to the right - the same
# layout `categorical_features` used to produce. The result stays sparse so
# later cells can keep calling X_2.toarray().
X_2 = sparse.hstack([X_onehot, X_numeric], format="csr")

In [55]:
# Convert to floats
# np.float was only an alias for the builtin float (float64) and was removed in
# NumPy 1.24; np.float64 is the same dtype and exists in every NumPy version.
X_2 = X_2.astype(np.float64)
y = y.astype(np.float64)

# Modeling

In [88]:
# WITH 1 FEATURE
# Use only the last column of the encoded matrix as the single predictor.
# Slice the sparse matrix before densifying so the full one-hot matrix is never
# materialised as a dense array (tocsr() makes the matrix sliceable).
X_Experimental = X_2.tocsr()[:, -1:].toarray()
# Fixed random_state: the 80/20 split, and therefore the reported error, is
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(X_Experimental, y, test_size=0.2, random_state=0)
model = LinearRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
# Mean squared error on the held-out 20%
error = mean_squared_error(y_test, predictions)
error

0.4841035250234285

In [97]:
# 5-fold cross-validation of the single-feature model, scored by negative MSE
model = LinearRegression()
scores = cross_val_score(
    estimator=model,
    X=X_Experimental,
    y=y,
    scoring="neg_mean_squared_error",  # TODO: also use R^2
    cv=5,
)
scores

array([-0.33232444, -0.42807937, -0.47367297, -0.55220785, -0.59517431])

In [91]:
# WITH ALL FEATURES WITHOUT ONE HOT ENCODING
# Uses the label-encoded frame X directly.
# Fixed random_state: the 80/20 split, and therefore the reported error, is
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
model = LinearRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
# Mean squared error on the held-out 20%
error = mean_squared_error(y_test, predictions)
error

0.4763171234862033

In [96]:
# 5-fold cross-validation on the label-encoded features, scored by negative MSE
model = LinearRegression()
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",  # TODO: also use R^2
    cv=5,
)
scores

array([-0.49720751, -0.42665251, -0.47254784, -0.5511474 , -0.59536158])

In [93]:
# WITH ALL FEATURES WITH ONE HOT ENCODING
# Train, test, split, calculate error
# Fixed random_state: the 80/20 split, and therefore the reported error, is
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(X_2, y, test_size=0.2, random_state=0)
model = LinearRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
# Mean squared error on the held-out 20%
error = mean_squared_error(y_test, predictions)
error

0.4920444443083271

In [98]:
# 5-fold cross-validation on the one-hot encoded features, scored by negative MSE
model = LinearRegression()
scores = cross_val_score(
    estimator=model,
    X=X_2,
    y=y,
    scoring="neg_mean_squared_error",  # TODO: also use R^2
    cv=5,
)
scores

array([-0.68291323, -0.44714807, -0.48660817, -0.56231347, -0.62312421])

In [99]:
# Fit the model on the complete one-hot encoded dataset (no hold-out split here)
model = LinearRegression()
model.fit(X=X_2, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [100]:
# Predict on the same rows the model was fitted on (in-sample predictions)
predictions = model.predict(X=X_2)

In [101]:
# In-sample statistics: fitted weights, intercept, and MSE of the predictions
# against the targets the model was trained on
w, b = model.coef_, model.intercept_
error = mean_squared_error(y_true=y, y_pred=predictions)
error, w, b

(0.37454661213751883,
 array([ 3.24238725e-02, -1.66322363e-02, -1.21891898e+00, ...,
         1.02374571e-05, -8.75947634e-05, -5.18034506e-06]),
 3.4861874147671927)

# Results

Features without location: 
error (MSE), w, b are (0.45079620978786505,
 array([ 0.00647003, -0.00115187, -0.00028681]),
 3.3909317414654145)

5-fold cross-validation returns (negative MSE per fold):
array([-0.44058222, -0.46424376, -0.44806606, -0.44477971, -0.45780714])

 
All features: 
error (MSE), w, b are (0.3475201663575248,
 array([-8.63861463e-01, -1.59436776e-01,  7.42368542e-01, ...,
         6.88801126e-03, -1.46312267e-03, -2.93404668e-04]),
 3.3865073437888973)

5-fold cross-validation returns (negative MSE per fold):
array([-0.49178862, -0.53366115, -0.52459808, -0.52376639, -0.53658507])