In [1]:
from input_data import *
import csv
from random import random
from sklearn.ensemble import RandomForestClassifier

In [2]:
def get_pins_map(file, max_features=1014):
    '''
    Returns pins map
    key = pin_id
    value = pin_features

    file         -- path to a '|'-delimited file; the first column of each
                    row is parsed as the integer pin id and the following
                    columns as float features
    max_features -- keep at most this many feature columns per pin
                    (default 1014, i.e. columns 1..1014 of each row)

    Blank lines are skipped. If a pin id occurs more than once, the last
    row wins. Raises ValueError if an id or a feature is not numeric.
    '''
    import sys

    # The csv module expects a binary file on Python 2 but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        f_pins = open(file, 'rb')
    else:
        f_pins = open(file, 'r', newline='')

    pins = {}
    with f_pins:
        reader = csv.reader(f_pins, delimiter='|')
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise
                continue
            pins[int(row[0])] = [float(x) for x in row[1:1 + max_features]]
    return pins

In [3]:
def get_users_map(file):
    '''
    Returns users map
    key = user_id
    value = user_features

    file -- path to a '|'-delimited file; the first column of each row is
            parsed as the integer user id and every remaining column as a
            float feature

    Blank lines are skipped. If a user id occurs more than once, the last
    row wins. Raises ValueError if an id or a feature is not numeric.
    '''
    import sys

    # The csv module expects a binary file on Python 2 but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        f_users = open(file, 'rb')
    else:
        f_users = open(file, 'r', newline='')

    users = {}
    with f_users:
        reader = csv.reader(f_users, delimiter='|')
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise
                continue
            users[int(row[0])] = [float(x) for x in row[1:]]
    return users

In [4]:
def get_repins(file):
    '''
    Returns all repins from given file
    Repin is represented as a tuple (user_id, pin_id)

    file -- path to a '|'-delimited file; the first two columns of each row
            are parsed as integers, any further columns are ignored

    Rows are returned in file order and blank lines are skipped.
    Raises ValueError if either id is not an integer.
    '''
    import sys

    # The csv module expects a binary file on Python 2 but a text file
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        repins_file = open(file, 'rb')
    else:
        repins_file = open(file, 'r', newline='')

    repins = []
    with repins_file:
        reader = csv.reader(repins_file, delimiter='|')
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise
                continue
            repins.append((int(row[0]), int(row[1])))
    return repins

In [5]:
# Load the (user_id, pin_id) pairs that the training set is built from:
# pos_repins become label-1 rows and neg_repins label-0 rows further down.
# The path constants are not defined in this notebook; presumably they come
# from `from input_data import *` -- verify.
pos_repins = get_repins(INPUT_REPINS)
neg_repins = get_repins(INPUT_NEGATIVE_REPINS_TRAIN)

In [6]:
# pin_id -> list of float features, as returned by get_pins_map
m_pins = get_pins_map(INPUT_PINS)

In [7]:
# user_id -> list of float features, as returned by get_users_map
m_users = get_users_map(INPUT_USERS)

In [8]:
# Training matrix: one row per repin whose user AND pin both have features.
# A row is the user's feature list followed by the pin's feature list;
# the label is 1 for positive repins and 0 for negative ones.
x = []
y = []

# Positives are processed first, then negatives, so row order is unchanged.
for label, repins in ((1, pos_repins), (0, neg_repins)):
    for user_id, pin_id in repins:
        # Skip pairs for which we are missing either side's features
        if user_id not in m_users or pin_id not in m_pins:
            continue

        x.append(m_users[user_id] + m_pins[pin_id])
        y.append(label)

In [None]:
# Sanity check: number of training rows and length of the first feature vector
print len(x), len(x[0])

In [None]:
# Baseline model: a 10-tree random forest fitted on the concatenated
# user + pin feature vectors, using 4 parallel jobs.
# random_state is pinned so that re-running the notebook rebuilds the same
# forest (and therefore the same scores) instead of a fresh random one.
clf = RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=0)
clf.fit(x, y)

In [9]:
test_pos_repins = get_repins(INPUT_TEST_FILE)
print len(pos_repins), 'positive repins were loaded from test set'

test_neg_repins = get_repins(INPUT_NEGATIVE_REPINS_TEST)
print len(neg_repins), 'negative repins were loaded from test set'

81447 positive repins were loaded from test set
54365 negative repins were loaded from test set


In [10]:
# Test matrix, assembled the same way as the training one: a row is the
# user's feature list followed by the pin's feature list, labelled 1 for
# positive repins and 0 for negative ones.
x_test = []
y_test = []

# Positives are processed first, then negatives, so row order is unchanged.
for label, repins in ((1, test_pos_repins), (0, test_neg_repins)):
    for user_id, pin_id in repins:
        # Skip pairs for which we are missing either side's features
        if user_id not in m_users or pin_id not in m_pins:
            continue

        x_test.append(m_users[user_id] + m_pins[pin_id])
        y_test.append(label)

In [13]:
# Predict a class label for every test row with the fitted classifier
y_hat = clf.predict(x_test)

In [19]:
# Score of the fitted classifier on the test set (mean accuracy for
# scikit-learn classifiers)
print clf.score(x_test, y_test)

0.792162570471


In [18]:
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test, y_hat)

print 'Precision', precision
print 'Recall', recall
print 'Thresholds', thresholds

Precision [ 0.50105015  0.71068866  1.        ]
Recall [ 1.          0.98698338  0.        ]
Thresholds [0 1]


In [None]:
tree_sizes = [10, 50, 100, 500, 1000]
import time

learning_times = []
predicting_times = []
scores = []
precisions = []
recalls = []
thresholds = []
for tree_size in tree_sizes:
    print 'No of trees', tree_size
    t_start = time.time()
    clf = RandomForestClassifier(n_estimators=tree_size, n_jobs=4)
    clf.fit(x, y)
    t_end = time.time()
    print 'Training time', t_end - t_start
    
    learning_times.append(t_end-t_start)
    
    p_start = time.time()
    y_hat = clf.predict(x_test)
    p_end = time.time()
    predicting_times.append(p_end-p_start)
    print 'Prediction time', p_end - p_start
    
    from sklearn.metrics import precision_recall_curve
    from sklearn.metrics import average_precision_score

    score = clf.score(x_test, y_test)
    print 'Score:', score
    avg_precision = average_precision_score(y_test, y_hat)
    print 'Avg precision', avg_precision
    precision, recall, threshold = precision_recall_curve(y_test, y_hat)
    print 'Precision', precision
    print 'Recall', recall
    print 'Threshold', threshold
    
    scores.append(score)
    precisions.append(precision)
    recalls.append(recall)
    thresholds.append(threshold)
    print '-------------------------------------------------'
    

No of trees 10
Training time 60.1246039867
Prediction time 4.05399394035
Score: 0.792328383507
Avg precision 0.852253604417
Precision [ 0.50105015  0.71063492  1.        ]
Recall [ 1.          0.98771878  0.        ]
Threshold [0 1]
-------------------------------------------------
No of trees 50


In [None]:
# (number of rows, length of the first feature vector) for each data set
print 'Training set', (len(x), len(x[0]))
print 'Test set', (len(x_test), len(x_test[0]))

In [None]:
import matplotlib.pyplot as plt

# Test-set score as a function of forest size.
# `scores` only holds an entry for each tree size whose run finished, so it
# is paired with the matching prefix of `tree_sizes`; plotting sequences of
# unequal length raises a ValueError.
plt.plot(tree_sizes[:len(scores)], scores, marker='o')
plt.xlabel('Number of trees')
plt.ylabel('Test accuracy')
plt.title('Random forest accuracy vs. number of trees')