In [98]:
import sys
import sklearn
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import csv

# Data Cleaning

In [22]:
# Helper functions
def convert_si_to_number(x):
    """Expand an abbreviated count such as ``'1.2k'`` or ``'3M'`` into an int.

    Parameters
    ----------
    x : str
        Either a plain integer string (``'987'``) or a number carrying one
        of the case-sensitive suffixes ``k`` (thousand), ``M`` (million) or
        ``B`` (billion).

    Returns
    -------
    int
        The expanded count. A bare suffix with no digits (``'k'``, ``'M'``
        or ``'B'``) yields ``0``.

    Raises
    ------
    ValueError
        If the numeric part of ``x`` cannot be parsed.
    """
    # Checked in this order; the first suffix found in the string wins.
    multipliers = (('k', 1000), ('M', 1000000), ('B', 1000000000))
    for suffix, multiplier in multipliers:
        if suffix in x:
            if len(x) <= 1:
                # Bare suffix: there is no number to scale.
                return 0
            scaled = float(x.replace(suffix, '')) * multiplier
            # Strip binary floating-point noise before truncating; without
            # this '32.3k' becomes 32299.999999999996 and truncates to 32299.
            return int(round(scaled, 6))
    return int(x)  # Less than 1000

In [23]:
# Read file
filename = "30000-40000.csv"


def _count_or_zero(value):
    """Parse an abbreviated count, treating the '--' placeholder as 0."""
    return 0 if value == '--' else convert_si_to_number(value)


i = 0       # number of rows kept
y = []      # labels: the raw score string of every kept row
X = []      # features: [city, state, reviews, salaries, interviews]
names = []  # name column of every kept row, parallel to X and y

# The context manager closes the handle even if a row fails to parse.
with open("../data/" + filename) as file:
    reader = csv.reader(file)
    for row in reader:
        if len(row) < 6:  # exclude data missing fields
            continue
        name, location, score, reviews, salaries, interviews = row[:6]

        location_parts = location.split(",")
        if len(location_parts) < 2:  # exclude data missing states or cities
            continue
        city = location_parts[0]
        state = location_parts[1]

        names.append(name)

        # X features. '--' marks a missing count; all three count columns
        # map it to 0 (previously only salaries and interviews did, so a
        # '--' in the reviews column raised ValueError).
        X.append([
            city,
            state,
            _count_or_zero(reviews),
            _count_or_zero(salaries),
            _count_or_zero(interviews),
        ])

        # y labels
        y.append(score)

        i = i + 1

In [24]:
# Convert to pandas dataframe
# NOTE(review): each feature row mixes strings and ints, so np.array
# presumably stores every value as a string here; a later cell casts the
# encoded matrix back to float. Confirm before relying on column dtypes.
column_names = ['City', 'State', 'Reviews', 'Salaries', 'Interviews']
X = np.array(X)
y = np.array(y)
X = pd.DataFrame(
    {label: X[:, position] for position, label in enumerate(column_names)}
)

In [32]:
# Drop rows with NA fields
# Work out which rows survive *before* dropping them so the same filter can
# be applied to y; dropping from X alone would leave features and labels
# misaligned whenever a row is actually removed.
complete_rows = X.notnull().all(axis=1).values
X = X.dropna()
y = y[complete_rows]
X.shape

(12051, 5)

In [79]:
# Label Encoder
categorical_feature_mask = X.dtypes==object
# Pick the location columns by name. The positional form X.columns[[0, 4]]
# only resolves to City/State when the frame's columns are sorted
# alphabetically; when the dict's insertion order is kept it selects City
# and Interviews instead.
categorical_cols = ['City', 'State']
# The encoder was previously never created in this notebook (hidden kernel
# state). fit_transform refits it for each column in turn.
le = preprocessing.LabelEncoder()
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))

In [93]:
# One hot encoding
# Encode only the location columns, selected by name so the result does not
# depend on the frame's column order. The remaining columns are kept and
# stacked to the right of the one-hot block.
location_cols = ['City', 'State']
try:
    # Newer scikit-learn: OneHotEncoder no longer accepts
    # `categorical_features`; ColumnTransformer is the replacement.
    from sklearn.compose import ColumnTransformer
except ImportError:
    # Older scikit-learn without sklearn.compose: keep the legacy argument,
    # but derive the positional indices from the column names.
    ohe = OneHotEncoder(
        categorical_features=[X.columns.get_loc(col) for col in location_cols]
    )
else:
    ohe = ColumnTransformer(
        [('location', OneHotEncoder(), location_cols)],
        remainder='passthrough',
        # Keep the stacked result sparse, since later cells call .toarray().
        sparse_threshold=1.0,
    )
X_2 = ohe.fit_transform(X)

In [94]:
# Convert to floats
# `np.float` was only an alias for the builtin `float` and has been removed
# from recent NumPy releases; the builtin gives the same float64 result.
X_2 = X_2.astype(float)
y = y.astype(float)

# Modeling

In [125]:
# WITH NON-LOCATION DATA
# Keep only the last three columns of the encoded matrix.
# NOTE(review): presumably these are the three count features that the
# encoder stacks to the right of the one-hot location block; verify.
X_Experimental = X_2.toarray()[:,-3:]
# A fixed random_state makes the split, and so the reported error,
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X_Experimental, y, test_size=0.2, random_state=42
)
model = LinearRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
# Mean squared error on the held-out 20%.
error = mean_squared_error(y_test, predictions)
error

0.4601820284546266

In [124]:
# WITH LOCATION DATA
# Train/test split, fit, and calculate the error on the held-out 20%.
# A fixed random_state makes the split, and so the reported error,
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X_2, y, test_size=0.2, random_state=42
)
model = LinearRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test)
error = mean_squared_error(y_test, predictions)
error

0.5193406921464706

In [107]:
# Test model using 5 fold cross validation
# Each returned score is the negated mean squared error of one fold.
# TODO: also report R^2.
model = LinearRegression()
scores = cross_val_score(
    model,
    X_2,
    y,
    scoring="neg_mean_squared_error",
    cv=5,
)
scores

array([-0.49178862, -0.53366115, -0.52459808, -0.52376639, -0.53658507])

In [84]:
# Train model on the full dataset (X_2 and y, with no held-out split)
model = LinearRegression()
model.fit(X=X_2, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [88]:
# Make predictions for every row of X_2
predictions = model.predict(X=X_2)

In [86]:
# Calculate statistics on training data
# error: mean squared error between y and the predictions
# w, b:  fitted coefficients and intercept of the linear model
error = mean_squared_error(y_true=y, y_pred=predictions)
w, b = model.coef_, model.intercept_
(error, w, b)

(0.3475201663575248,
 array([-8.63861463e-01, -1.59436776e-01,  7.42368542e-01, ...,
         6.88801126e-03, -1.46312267e-03, -2.93404668e-04]),
 3.3865073437888973)

# Results

Features without location: 
error (mean squared error), w, b are (0.45079620978786505,
 array([ 0.00647003, -0.00115187, -0.00028681]),
 3.3909317414654145)

5-fold cross-validation returns (negative mean squared error per fold)
array([-0.44058222, -0.46424376, -0.44806606, -0.44477971, -0.45780714])

 
All features: 
error (mean squared error on the training data), w, b are (0.3475201663575248,
 array([-8.63861463e-01, -1.59436776e-01,  7.42368542e-01, ...,
         6.88801126e-03, -1.46312267e-03, -2.93404668e-04]),
 3.3865073437888973)

5-fold cross-validation returns (negative mean squared error per fold)
array([-0.49178862, -0.53366115, -0.52459808, -0.52376639, -0.53658507])