In [1]:
import sys
import sklearn
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
import csv

# Data Cleaning

In [2]:
# Helper functions
def convert_si_to_number(x):
    total_stars = 0
    if 'k' in x:
        if len(x) > 1:
            total_stars = float(x.replace('k', '')) * 1000 # convert k to a thousand
    elif 'M' in x:
        if len(x) > 1:
            total_stars = float(x.replace('M', '')) * 1000000 # convert M to a million
    elif 'B' in x:
        total_stars = float(x.replace('B', '')) * 1000000000 # convert B to a Billion
    else:
        total_stars = int(x) # Less than 1000
    
    return int(total_stars)

In [3]:
# Read file
filename = "30000-40000.csv"
file = open("../data/30000-40000.csv")
reader = csv.reader(file)
i = 0
y = []
X = []
names = []
for row in reader:
    if len(row) < 6: # exclude data missing fields
        continue
    name = row[0]
    location = row[1]
    score = row[2]
    reviews = row[3]
    salaries = row[4]
    interviews = row[5]
    if len(location.split(",")) < 2: # exclude data missing states or cities
        continue
    
    names.append(name)
        
    # X features
    feature = []
    city = location.split(",")[0]
    state = location.split(",")[1]
    feature.append(city)
    feature.append(state)
    feature.append(convert_si_to_number(reviews))
    if salaries == '--':
        feature.append(0)
    else:
        feature.append(convert_si_to_number(salaries))
    if interviews == '--':
        feature.append(0)
    else:
        feature.append(convert_si_to_number(interviews))
    X.append(feature)

    # y labels
    y.append(score)
    
    i = i + 1

In [4]:
# Convert to pandas dataframe
X = np.array(X)
y = np.array(y)
X = pd.DataFrame({'City': X[:, 0], 'State': X[:, 1], 'Reviews': X[:, 2], 'Salaries': X[:, 3], 'Interviews': X[:, 4]})
# X = pd.DataFrame({'Reviews': X[:, 2], 'Salaries': X[:, 3], 'Interviews': X[:, 4]})

In [30]:
# Drop rows with NA fields
X = X.dropna()
X.shape
X

Unnamed: 0,City,Interviews,Reviews,Salaries,State
0,Tokyo,10,46,21,Japan
1,Hamilton,9,46,39,Bermuda
2,San Jose,15,45,42,CA
3,Spokane Valley,3,45,107,WA
4,Ashburn,18,45,88,VA
5,Houston,9,45,135,TX
6,Austin,15,45,28,TX
7,Woburn,17,44,877,MA
8,Fort Lauderdale,5,44,55,FL
9,Horsham,27,44,40,United Kingdom


In [31]:
# Convert to numbers
le = preprocessing.LabelEncoder()
X_2 = X.apply(le.fit_transform)
enc = preprocessing.OneHotEncoder()
enc.fit(X_2)
onehotlabels = enc.transform(X_2).toarray()
onehotlabels.shape

Unnamed: 0,City,Interviews,Reviews,Salaries,State
0,2206,2,162,105,45
1,890,116,162,187,10
2,1945,21,158,195,12
3,2095,56,158,11,106
4,85,32,158,282,102
5,986,116,158,42,96
6,107,21,158,142,96
7,2445,29,156,281,53
8,748,82,156,227,24
9,983,51,156,192,101


In [8]:
# One hot encoding
enc = preprocessing.OneHotEncoder()
enc.fit(X_2)
X_3 = enc.transform(X_2).toarray()
X_3.shape

(12051, 3259)

In [19]:
sum(X_3[4])

5.0

In [287]:
# Convert to floats
X_3 = X_3.astype(np.float)
y = y.astype(np.float)

# Modeling

In [288]:
# Test model using 5 fold cross validation
model = LinearRegression()
scores = cross_val_score(model, X_3, y, cv=5, scoring = "neg_mean_squared_error") # also use R^2
scores

array([-7.18030106e+24, -6.73594703e+24, -1.13052612e+25, -2.21435168e+24,
       -3.46957127e+24])

In [289]:
# Train model on training data
model = LinearRegression()
model.fit(X_3, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [290]:
# Make predictions
predictions = model.predict(X_3)

In [291]:
# Calculate statistics on training data
error = mean_squared_error(y, predictions)
w = model.coef_
b = model.intercept_
error, w, b

(0.3102284362023129,
 array([ 8.50261952e+11, -5.50221018e+11, -5.50221018e+11, ...,
         1.75720673e+12,  1.75720673e+12,  1.75720673e+12]),
 -12612990674079.588)

# Results

Non-location features: 
error, w, b is (0.45079620978786505,
 array([ 0.00647003, -0.00115187, -0.00028681]),
 3.3909317414654145)

5-fold cross validation returns
array([-0.44058222, -0.46424376, -0.44806606, -0.44477971, -0.45780714])

 
All features: 
error, w, b is (0.3102284362023129,
 array([ 8.50261952e+11, -5.50221018e+11, -5.50221018e+11, ...,
         1.75720673e+12,  1.75720673e+12,  1.75720673e+12]),
 -12612990674079.588)

5-fold cross validation returns
array([-7.18030106e+24, -6.73594703e+24, -1.13052612e+25, -2.21435168e+24,
       -3.46957127e+24])

# Experimental (ignore)

In [99]:
import pandas as pd
X = pd.read_csv("../data/30000-40000.csv")


(12053, 6)

In [100]:
le = preprocessing.LabelEncoder()
X_2 = X.apply(le.fit_transform)
enc = preprocessing.OneHotEncoder()
enc.fit(X_2)
onehotlabels = enc.transform(X_2).toarray()
onehotlabels.shape

(12053, 15297)

In [124]:
data = np.array([[5.8, 2.8], [6.0, 2.2]])
dataset = pd.DataFrame({'Column1': data[:, 0], 'Column2': data[:, 1]})
dataset

Unnamed: 0,Column1,Column2
0,5.8,2.8
1,6.0,2.2
