In [1]:
from input_data import *
import csv
from random import random
from sklearn.ensemble import RandomForestClassifier

In [2]:
def get_pins_map(file):
    '''
    Returns pins map
    key = pin_id
    value = pin_features
    '''
    pins = {}
    with open(file, 'rb') as f_pins:
        reader = csv.reader(f_pins, delimiter='|')
        for row in reader:
            pins[int(row[0])] = [float(x) for x in row[1:1015]]
    return pins

In [3]:
def get_users_map(file):
    '''
    Returns users map
    key = user_id
    value = user_features
    '''
    users = {}
    with open(file, 'rb') as f_users:
        reader = csv.reader(f_users, delimiter='|')
        for row in reader:
            users[int(row[0])] = [float(x) for x in row[1:]]
    return users

In [4]:
def get_repins(file):
    '''
    Returns all repins from given file
    Repin is represented as a tuple (user_id, pin_id)    
    '''
    repins = []
    with open(file, 'rb') as repins_file:
        reader = csv.reader(repins_file, delimiter='|')
        for row in reader:
            repins.append((int(row[0]), int(row[1])))
    return repins

In [5]:
pos_repins = get_repins(INPUT_REPINS)
neg_repins = get_repins(INPUT_NEGATIVE_REPINS_TRAIN)

In [6]:
m_pins = get_pins_map(INPUT_PINS)

In [7]:
m_users = get_users_map(INPUT_USERS)

In [8]:
x = []
y = []

for repin in pos_repins:
    if repin[0] not in m_users:
        continue
    if repin[1] not in m_pins:
        continue
        
    user_feature = m_users[repin[0]]
    pin_feature = m_pins[repin[1]]
    
    x.append(user_feature + pin_feature)
    y.append(1)

for repin in neg_repins:
    if repin[0] not in m_users:
        continue
    if repin[1] not in m_pins:
        continue
        
    user_feature = m_users[repin[0]]
    pin_feature = m_pins[repin[1]]
    
    x.append(user_feature + pin_feature)
    y.append(0)

In [9]:
print len(x), len(x[0])

135812 1051


In [10]:
clf = RandomForestClassifier(n_estimators=10, n_jobs=4)
clf.fit(x, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [11]:
pos_repins = get_repins(INPUT_TEST_FILE)
print len(pos_repins), 'positive repins were loaded from test set'

neg_repins = get_repins(INPUT_NEGATIVE_REPINS_TEST)
print len(neg_repins), 'negative repins were loaded from test set'

27196 positive repins were loaded from test set
27082 negative repins were loaded from test set


In [12]:
x_test = []
y_test = []

for repin in pos_repins:
    if repin[0] not in m_users:
        continue
    if repin[1] not in m_pins:
        continue
    
    user_feature = m_users[repin[0]]
    pin_feature = m_pins[repin[1]]
    
    x_test.append(user_feature + pin_feature)
    y_test.append(1)
    
for repin in neg_repins:
    if repin[0] not in m_users:
        continue
    if repin[1] not in m_pins:
        continue
    
    user_feature = m_users[repin[0]]
    pin_feature = m_pins[repin[1]]
    
    x_test.append(user_feature + pin_feature)
    y_test.append(0)

In [13]:
y_hat = clf.predict(x_test)

In [17]:
print clf.score(x_test, y_test)

0.792162570471
0.792162570471


In [18]:
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test, y_hat)

print 'Precision', precision
print 'Recall', recall
print 'Thresholds', thresholds

Precision [ 0.50105015  0.71068866  1.        ]
Recall [ 1.          0.98698338  0.        ]
Thresholds [0 1]
