In [20]:
from input_data import *
import csv
from random import random
from sklearn.ensemble import RandomForestClassifier

In [21]:
def get_user_ids():
    '''
    Reads the pipe-delimited users file (INPUT_USERS) and returns
    the first column of every row, converted to int, as a list
    in file order.
    '''
    user_ids = []
    with open(INPUT_USERS, 'rb') as users_file:
        for record in csv.reader(users_file, delimiter='|'):
            # Only the leading column (the user id) is kept.
            user_ids.append(int(record[0]))
    return user_ids

In [44]:
def get_repins(file):
    '''
    Loads every row of the pipe-delimited file at the given path.

    Each row is returned as a list of ints (every column is converted),
    and rows keep the order they have in the file.
    '''
    with open(file, 'rb') as source:
        return [[int(cell) for cell in record]
                for record in csv.reader(source, delimiter='|')]

In [45]:
def get_repins_for_user(user_id, file):
    '''
    Loads the rows of the pipe-delimited file at the given path whose
    first column equals user_id.

    Matching rows are returned in file order, each as a list of ints.
    '''
    with open(file, 'rb') as source:
        records = csv.reader(source, delimiter='|')
        # The first column is compared before the rest of the row is converted.
        return [[int(cell) for cell in record]
                for record in records
                if int(record[0]) == user_id]

In [46]:
def get_pin_features_by_repins(repins):
    '''
    Returns quality and object features of pins,
    selected from repins (repin.anon_pin_id)

    repins -- list of repin rows; element 1 of each row is used
              as the pin id to look up.

    Returns a list with one entry per row of INPUT_PINS whose first
    column is one of those pin ids (in file order). Each entry holds
    columns 1..1014 of that row converted to float.
    '''
    # A set keeps the per-row membership test constant-time; with a list
    # every row of the pins file triggered a linear scan over all repins.
    wanted_pin_ids = set(repin[1] for repin in repins)

    pins_features = []
    with open(INPUT_PINS, 'rb') as pins:
        reader = csv.reader(pins, delimiter='|')
        for row in reader:
            if int(row[0]) in wanted_pin_ids:
                # Column 0 is the pin id; the feature values follow it.
                pins_features.append([float(x) for x in row[1:1015]])
    return pins_features

In [47]:
def get_pin_features_not_in_repins(repins):
    '''
    Returns features of pins that do NOT appear in the given repins.

    repins -- list of repin rows; element 1 of each row is used
              as the pin id to exclude.

    Scans INPUT_PINS in file order and collects columns 1..1014
    (as floats) of every row whose first column is not one of those
    pin ids. Scanning stops as soon as more than 500 rows have been
    collected, so at most 501 entries are returned.
    '''
    # A set keeps the per-row membership test constant-time; with a list
    # every row of the pins file triggered a linear scan over all repins.
    excluded_pin_ids = set(repin[1] for repin in repins)

    pins_features = []
    with open(INPUT_PINS, 'rb') as pins:
        reader = csv.reader(pins, delimiter='|')
        for row in reader:
            if int(row[0]) not in excluded_pin_ids:
                # Column 0 is the pin id; the feature values follow it.
                pins_features.append([float(x) for x in row[1:1015]])

            # NOTE(review): the check runs after the append and uses `>`,
            # so the cap is 501 rows - confirm whether 500 was intended.
            if len(pins_features) > 500:
                break

    return pins_features

In [40]:
users = get_user_ids()
# Pick one user at random: scale random()'s [0, 1) value to a list index.
random_index = int(len(users) * random())
user_id = users[random_index]

In [48]:
repins = get_repins_for_user(user_id, INPUT_TRAIN_FILE)

x = get_pin_features_by_repins(repins)
y = [1 for i in range(len(x))]

repins = get_repins_for_user(user_id, INPUT_REPINS)
x_n = get_pin_features_not_in_repins(repins)
y_n = [0 for i in range(len(x_n))]

x_all = x + x_n
y_all = y + y_n

print "OK"

OK


In [49]:
# Random forest of 10 trees, trained with 4 parallel jobs.
clf = RandomForestClassifier(n_jobs=4, n_estimators=10)
# Kept as the cell's last expression so the fitted estimator is displayed.
clf.fit(x_all, y_all)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [50]:
repins = get_repins_for_user(user_id, INPUT_TEST_FILE)
if len(repins) == 0:
    print "No repins were found in test set for given user"
    exit(-1)
    
print len(repins), 'repins were loaded from test set'

3 repins were loaded from test set


In [51]:
x_test = get_pin_features_by_repins(repins)
total = len(x_test)

if total == 0:
    print "No pins were loaded based on test set"
    exit(-2)
    
print total, 'pins were loaded based on repins from test set'

2 pins were loaded based on repins from test set


In [52]:
count = 0
for x_hat in x_test:
    y_hat = clf.predict(x_hat)
    
    if y_hat == 1:
        count += 1

print 'Precision =', count * 1. / total

Precision = 0.5
