# Facebook: Predicting Check Ins

### Group: Suman Tripathy, Joelynn Deng, AC Dela Paz, Tinh Nguyen

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression, RidgeClassifier
np.random.seed(30)

We'd first like to give credit to Jerry the TA for his implementation of gridding. Gridding splits the map into small rectangular cells and fits a separate model on the check-ins inside each cell, so we can train faster and more efficiently and get a better idea of what $k$ to pick for kNN.

[Link to original](https://github.com/kaggledecal/kaggle_fa16/blob/master/code/day13/fb_validation_code.ipynb)



In [None]:
#APK Metric Function
def apk(actual, predicted, k=3):
    """
        actual - collection of relevant items
        predicted - ranked list of predicted items, best first
        k - how many leading predictions are scored

        Average precision at k for one query: the precision at each hit is
        summed over the first k predictions and divided by min(len(actual), k).
        A prediction that repeats an earlier one is only credited once.
        Returns 0.0 when `actual` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for rank, guess in enumerate(top_k, start=1):
        if guess not in actual:
            continue
        if guess in top_k[:rank - 1]:
            # Same item was already recommended at a better rank.
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

In [2]:
# Mean Average Precision at 3 (MAP@3) - Evaluation function
def mapAt3(true, predictions):
    """
        true - sequence of N true place_id's (list, array or pandas Series)
        predictions - sequence of N ranked recommendation lists (list, 2-D array
                      or Series of lists), one per entry of `true`

        Returns the mean of apAt3 over the N data points, or 0.0 when N == 0.
    """
    # Materialise both inputs so the indexing below is positional. Indexing a
    # pandas Series with true[i] is a *label* lookup, which fails or returns the
    # wrong row when the index is not 0..N-1 (e.g. after a shuffled split).
    truths = list(true)
    ranked = list(predictions)

    N = len(truths)
    if N == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    s = 0.0
    for i in range(N):
        s += apAt3(truths[i], ranked[i])
    return s / N

# Average Precision at 3 (ap@3)
def apAt3(actual, predicted):
    """
        actual - place_id
        predicted - list of recommended place_id's, best first

        Calculates the average precision for one data point.
        Only one place is correct (m=1), so the score is 1/rank of the first
        correct recommendation among the top three, and 0.0 if none of them
        is correct.
    """
    for i, rec in enumerate(predicted):
        if i >= 3:
            # Only the first three recommendations count towards ap@3.
            break
        if rec == actual:
            # Stop at the first hit: crediting a repeated correct id again
            # would let the score exceed 1.
            return 1.0 / (i + 1)
    return 0.0
        

In [3]:
# Sanity check: the correct id '200' is the third recommendation, so the score is 1/3.
apAt3('200', ['100', '', '200'])

0.3333333333333333

In [4]:
#Read in the data
fb_train = pd.read_csv("data/train.csv")
fb_test = pd.read_csv("data/test.csv")

# Features (every column except the label) and the label itself.
X = fb_train.drop(['place_id'], axis=1)
y = fb_train['place_id']

# Preview goes last: only the final expression of a cell is displayed.
fb_train.head()


In [5]:
#Processing
# `time` is treated as minutes elapsed since an arbitrary origin, so absolute
# dates are meaningless; only the derived hour/weekday/... cycles are used.
# The unit is passed positionally, the documented np.datetime64(value, unit) form.
initial_date = np.datetime64('2014-01-01T01:01', 'm')   #Arbitrary

# Vectorised offset: one array addition instead of building a np.timedelta64
# per row in a Python-level generator.
d_times = pd.DatetimeIndex(initial_date + fb_train.time.values.astype('timedelta64[m]'))

In [6]:
# Copy the calendar components of d_times into fb_train, one column per
# component, in the same column order as before.
for part in ('hour', 'weekday', 'day', 'month', 'year'):
    fb_train[part] = getattr(d_times, part)
fb_train.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,hour,weekday,day,month,year
0,0,0.7941,9.0809,54,470702,8523065625,22,6,23,11,2014
1,1,5.9567,4.7968,13,186555,1757726713,14,5,10,5,2014
2,2,8.3078,7.0407,74,322648,1137537235,2,2,13,8,2014
3,3,7.3665,2.5165,65,704587,6567393236,8,1,5,5,2015
4,4,4.0961,1.1307,31,472130,7440663949,21,0,24,11,2014


In [7]:
# Feature matrix: every column except the raw timestamp and the label.
# NOTE(review): this keeps row_id as a feature - confirm that is intended.
X = fb_train.drop(['time', 'place_id'], axis=1)
y = fb_train['place_id']

features = ['x','y','hour','day','weekday','month','year','accuracy','place_id']
#Test, train, & split by time
split_t = 730000
local_train_time = fb_train.loc[fb_train.time < split_t, features]
local_test_time = fb_train.loc[fb_train.time >= split_t, features]

#Simply use test_train_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [8]:
#Create our grid
# Credit to gridding implementation goes to Jerry
# Width (x_range) and height (y_range) of one grid cell, in raw coordinate units.
x_range = 0.5
y_range = 0.25

# Per-axis weights from the original implementation. They are kept for
# reference but no longer scale the grid: the x / y columns are never
# multiplied by them in this notebook, so weight-scaled edges (0..5000 and
# 0..10000) put every row into cell (0, 0) and left all other cells empty.
x_weights = 500
y_weights = 1000

# Cell edges in the same units as the data (coordinates run from 0 to 10).
# One extra edge past 10 is appended so that points lying exactly on the upper
# boundary still fall inside the last (half-open) cell.
xpoints = np.arange(0, 10, x_range)
xpoints = np.append(xpoints, 10 + x_range)
ypoints = np.arange(0, 10, y_range)
ypoints = np.append(ypoints, 10 + y_range)

# x_grid[i][j] == xpoints[j] and y_grid[i][j] == ypoints[i]
x_grid, y_grid = np.meshgrid(xpoints, ypoints)

In [None]:
# Peek at the first rows of the training features returned by train_test_split.
X_train.head()

In [None]:
#Train on one grid using normal training set
score = 0.0
total = 0
cumulative_scores = []

# Grid cell (row i, column j) to inspect.
i, j = 0, 0

# Keep the rows whose coordinates fall inside the half-open cell
# [x_grid[i, j], x_grid[i, j+1]) x [y_grid[i, j], y_grid[i+1, j]).
subset = X_train.loc[
    (X_train['x'] >= x_grid[i, j]) & (X_train['x'] < x_grid[i, j + 1])
    & (X_train['y'] >= y_grid[i, j]) & (X_train['y'] < y_grid[i + 1, j])
]
test_subset = X_test.loc[
    (X_test['x'] >= x_grid[i, j]) & (X_test['x'] < x_grid[i, j + 1])
    & (X_test['y'] >= y_grid[i, j]) & (X_test['y'] < y_grid[i + 1, j])
]

In [None]:
#Train on one grid using time-based training set
time_score = 0.0
time_total = 0
time_cumulative_scores = []

i = 0
j = 0

# X_traintime / X_testtime were never defined in this notebook, so this cell
# raised NameError on a fresh kernel. Build them from the time-based split;
# features[:-1] drops the trailing 'place_id' label column.
# NOTE(review): presumably this matches the originally intended frames - confirm.
X_traintime = local_train_time[features[:-1]]
y_traintime = local_train_time['place_id']
X_testtime = local_test_time[features[:-1]]
y_testtime = local_test_time['place_id']

# Rows inside the half-open grid cell (i, j).
time_subset = X_traintime[(X_traintime['x'] >= x_grid[i][j]) & (X_traintime['x'] < x_grid[i][j+1]) & \
                     (X_traintime['y'] >= y_grid[i][j]) & (X_traintime['y'] < y_grid[i+1][j])]
time_test_subset = X_testtime[(X_testtime['x'] >= x_grid[i][j]) & (X_testtime['x'] < x_grid[i][j+1]) & \
                         (X_testtime['y'] >= y_grid[i][j]) & (X_testtime['y'] < y_grid[i+1][j])]

In [None]:
def cv_stats(cv_score):
    """
    Summarise the per-fold scores produced by cross_val_score.

    cv_score - array-like of fold scores
    Returns a (mean, standard deviation) tuple computed with numpy.
    """
    return np.mean(cv_score), np.std(cv_score)

# K Nearest Neighbors Model

Let's construct a KNN model. We'll evaluate it two ways: with the standard
`train_test_split`, and with a split by time.

In [None]:
# Scratch check of the slicing idiom used for the top-3 lookup in the grid loop:
# [-3:] keeps the last three items and [::-1] reverses them, giving [6, 5, 4].
x = [1, 2, 3, 4, 5, 6]
x[-3:][::-1]

In [None]:
#Test on time-based training set

# Taken directly from Jerry's example with some changes (different eval func.)
#Iterate through all grids
# Running sum of ap@3 and number of scored test rows across all grid cells.
score = 0.0
total = 0

xs = x_grid
ys = y_grid

for i in range(len(ypoints)-1):
    for j in range(len(xpoints)-1):
        # Rows of the time-based train / test frames inside half-open cell (i, j).
        subset = local_train_time[(local_train_time.x >= xs[i][j]) & (local_train_time.x < xs[i][j+1]) & \
                                  (local_train_time.y >= ys[i][j]) & (local_train_time.y < ys[i+1][j])]
        test_subset = local_test_time[(local_test_time.x >= xs[i][j]) & (local_test_time.x < xs[i][j+1]) & \
                                      (local_test_time.y >= ys[i][j]) & (local_test_time.y < ys[i+1][j])]
        if len(test_subset)==0:
            print('Moving on from j=%d' % j)
            continue

        if len(subset)==0:
            continue

        # Use a cell-local name: assigning to `y` here would overwrite the
        # notebook-wide label Series that later cells still read.
        cell_labels = subset['place_id']
        # A cell can hold fewer than 40 training rows; kNN cannot ask for more
        # neighbours than there are samples.
        clf = KNN(n_neighbors=min(40, len(subset)))
        clf.fit(subset[features[:-1]], cell_labels) # Now we exclude place_id
        all_preds = clf.predict_proba(test_subset[features[:-1]])
        true_ids = test_subset['place_id'].values

        for record in range(len(all_preds)):
            # Indices of the three most probable classes, most probable first.
            top3_idx = all_preds[record].argsort()[-3:][::-1]
            preds = clf.classes_[top3_idx]
            # apAt3 expects the true place_id itself, not a one-element list.
            apk_score = apAt3(true_ids[record], preds) # calculate ap@3
            score += apk_score
            total += 1
        print(j)
        print(score/total)
    print(i)

In [None]:


# X_traintime / y_traintime / X_testtime were never defined in this notebook
# (NameError on a fresh kernel). Build them from the time-based split;
# features[:-1] drops the trailing 'place_id' label column.
# NOTE(review): presumably this matches the originally intended frames - confirm.
X_traintime = local_train_time[features[:-1]]
y_traintime = local_train_time['place_id']
X_testtime = local_test_time[features[:-1]]

# Use a fresh classifier instead of whichever one the grid loop left in `clf`.
clf = KNN(n_neighbors=40)
clf.fit(X_traintime, y_traintime)
# NOTE(review): predict_proba returns one column per distinct place_id, so on
# the full test split this array may be very large - confirm it fits in memory.
all_time_predictions = clf.predict_proba(X_testtime)


# Linear Regression Model


In [None]:
# Initialize linear regression and fit it on the randomly split training set.
# NOTE(review): the target is place_id, an identifier rather than a quantity -
# confirm that a regression on it is intended.
clf = linear_model.LinearRegression()
clf.fit(X=X_train, y=y_train)


In [None]:
#Score Linear Regression based on normal training set
# For a regressor, .score() is the coefficient of determination (R^2), not a
# classification accuracy, so the printed labels say R^2.
train_r1 = clf.score(X_train, y_train)
test_r1 = clf.score(X_test, y_test)

print("Train R^2 : ", train_r1)
print("Test R^2 : ", test_r1)

In [None]:
y_pred = clf.predict(X_test) # predict on test set

fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
# Reference line "predicted == actual", spanning the range of the plotted
# y_test values. The notebook-wide `y` is not used here because other cells
# reassign it, which would silently change the line's extent.
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=4)
ax.set_xlabel('Test')
ax.set_ylabel('Predicted')
ax.set_title('Comparison of Actual vs Predicted Values')
plt.show()

# Logistic Regression Model


In [None]:
# iterate through a range of penalty values and evaluate how well LogisticRegression does on average
# Note: `penalty` is passed as C, which in scikit-learn is the *inverse* of the
# regularization strength (smaller C = stronger regularization).
# An explicit grid avoids relying on repeated float multiplication to land
# exactly on the upper bound of the sweep.
for penalty in (0.001, 0.01, 0.1, 1.0, 10.0):
    clf = LogisticRegression(C = penalty)
    score = cross_val_score(clf, X_train, y_train, cv = 5)
    stats = cv_stats(score)
    # Single format string: previously the first literal was never formatted
    # and printed a raw "{0}".
    print("For penalty={0}; Average={1}, STDEV={2}".format(penalty, *stats))

# initialize and fit logistic regression
clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted logistic regression on the train and test splits.
train_r2, test_r2 = clf.score(X_train, y_train), clf.score(X_test, y_test)

print("Train accuracy : ", train_r2)
print("Test accuracy : ", test_r2)

# Ridge Regression Model


In [None]:
# Sweep RidgeClassifier's alpha over 0.01, 0.1 and 1 (x10 per step, stopping
# before 10) and report the 5-fold cross-validation mean and std for each value.
penalty = 0.01
while penalty < 10:
    clf = RidgeClassifier(alpha=penalty)
    score = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
    stats = cv_stats(score)
    print("For penalty = ", penalty, "Average, std = ", stats)
    penalty = penalty * 10

# Best Model
## K Nearest Neighbors

In [None]:
# Fit a 40-neighbour KNN on the randomly split training set (fit returns the
# estimator itself, so construction and fitting can be chained).
clf = KNN(n_neighbors=40).fit(X_train, y_train)

# Mean accuracy on the data the model was fit on and on the held-out split.
train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)


print("Train accuracy : ", train_acc)
print("Test accuracy : ", test_acc)