In [1]:


import numpy as np
import pandas as pd
import pymysql as mdb
import sqlalchemy as sa

from decimal import *

import sexmachine.detector as gender
identify_gender = gender.Detector()

# load local sql engine
import pickle
import sys
import re
import psycopg2

from sqlalchemy_utils import database_exists, create_database

%matplotlib inline
%config InlineBackend.figure_format='retina'
pd.set_option('display.max_columns', 500)
local_weave_pair = sa.create_engine("postgres://%s@localhost/%s"%('rootname','weave_pair'))


In [2]:
from sklearn.ensemble import RandomForestClassifier

rfclf = RandomForestClassifier(n_estimators=10000, class_weight="auto")


In [3]:
## create a database (if it doesn't exist)
if not database_exists(local_weave_pair.url):
    create_database(local_weave_pair.url)
print(database_exists(local_weave_pair.url))

# connect:
#con = None
con = psycopg2.connect(database = 'weave_pair', user = 'jiongz')
#con = psycopg2.connect(database = dbname, user = username, password = passwd)


# query:
sql_query = """
SELECT * FROM training_set;
"""
training_set = pd.read_sql_query(sql_query,con)

True


# Get Rating Prediction Model

In [7]:
y = training_set.loc[:,'rating'].values
X = training_set.loc[:,['meeting_times_x','meeting_times_y', 'get_ratings_mean', 'match_scores', 'user_degrees', 'match_degrees', 'user_genders', 'match_genders']].values
rfclf.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight='auto', criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Calculate meeting features for available users

In [8]:
# get user features query:
sql_query = """
SELECT * FROM user_features_combine;
"""
user_features = pd.read_sql_query(sql_query,con)
user_features.head(3)

Unnamed: 0,level_0,user_id,meeting_times,hood,title,looking_for,get_ratings_mean,ratings_mean,index,name,degree,start_yr,majors
0,0,307,12,soma market st north beach,{ceo},"{""investor founder""}",4.333333333333333,4.25,67,Everett Young,"B.A.,",2000,"{""Entrepreneurship/Entrepreneurial Studies"",""T..."
1,1,2183,11,,{founder},"{""""}",4.888888888888888,4.363636363636364,118,Brian Ma,"BS,",2004,"{""Computer Science, Electrical Engineering""}"
2,2,1081,10,,"{""softwar engin""}","{""""}",4.5,4.5,107,Amin Shaikh,"Sc.B.,",2007,"{""Computer Engineering""}"


In [9]:
user_features_com=user_features.dropna()

In [10]:
user_id_set = user_features_com['user_id'].values

In [11]:
user_id_set.astype('int');

In [12]:
available_user_ids = [217, 18119, 17745, 16570,  1286, 17668, 18096, 18182,  1282,
       17738, 18184, 18095, 17743, 18835,   137, 18863,   388, 17495,
         422,   399,   100, 18925, 16512, 18903,   474, 18902, 18818,
       18898,   475, 18896, 18908, 18544,   113, 18917,   127, 18875,
           2, 18500, 18612, 18613]

In [24]:
## building features for each pair and calculate pred_rating in meeting score matrix
## This is the time-consuming part. It predicts ratings for all possible pairs.
## rating_user in rows; match_user in 
## ['meeting_times_x','meeting_times_y', 'get_ratings_mean', 'match_scores', 'user_degrees', 'match_degrees', 'user_genders', 'match_genders']

meeting_score_matrix = pd.DataFrame(index=available_user_ids, columns=available_user_ids)
degree_dict = {'b':1, 'm':2, 'p':3}
gender_dict = {'female':0, 'mostly_female':0, 'male':1, 'mostly_male':1, 'andy':2}

for user_1 in available_user_ids:
    for user_2 in available_user_ids:
        if user_1 == user_2:
            # meeting rating 0 if same user
            meeting_score_matrix.set_value(user_1, user_2, 0)
        else:
            ############################ meeting features ###############################
            meeting_times_x = user_features_com[user_features_com['user_id'] == user_1]['meeting_times'].values[0].astype('int')
            meeting_times_y = user_features_com[user_features_com['user_id'] == user_2]['meeting_times'].values[0].astype('int')
            get_ratings_mean= user_features_com[user_features_com['user_id'] == user_1]['get_ratings_mean'].values[0]
            # calculate match score
            demands = set(user_features_com.loc[user_features_com['user_id'] == user_1,'looking_for'].values[0][0].split())
            supplies = set(user_features_com.loc[user_features_com['user_id']== user_2,'title'].values[0][0].split())
            if not demands:
                match_score = 1
            else:
                if demands.intersection(supplies):
                    match_score = 2
                else:
                    match_score = 0
            # user1 degree
            if user_features_com.loc[user_features_com['user_id'] == user_1,'degree'].values:
                user_degree = user_features_com.loc[user_features_com['user_id'] == user_1,'degree'].values[0][0].lower()
                if user_degree in degree_dict:
                    user_degree = degree_dict[user_degree]
                else:
                    user_degree = 0
            else:
                user_degree = 0
            # match degree
            if user_features_com.loc[user_features_com['user_id'] == user_2,'degree'].values:
                match_degree = user_features_com.loc[user_features_com['user_id'] == user_2,'degree'].values[0][0].lower()
                if match_degree in degree_dict:
                    match_degree = degree_dict[match_degree]
                else:
                    match_degree = 0
            else:
                match_degree = 0
        
            # get genders
            # user_gender
            user_name = user_features_com.loc[user_features_com['user_id'] == user_1,'name'].values[0].split()[0]
            user_gender = gender_dict[identify_gender.get_gender(user_name)]
            # match_gender
            match_name = user_features_com.loc[user_features_com['user_id'] == user_2,'name'].values[0].split()[0]
            match_gender = gender_dict[identify_gender.get_gender(match_name)]
            
            ################# predict meeting rating ####################### 
            user1_rate_user2 = rfclf.predict([meeting_times_x, meeting_times_y, get_ratings_mean, match_score, user_degree, match_degree, user_gender, match_gender])
            meeting_score_matrix.set_value(user_1, user_2, user1_rate_user2[0])

In [26]:
meeting_score_matrix[:3]

Unnamed: 0,217,18119,17745,16570,1286,17668,18096,18182,1282,17738,18184,18095,17743,18835,137,18863,388,17495,422,399,100,18925,16512,18903,474,18902,18818,18898,475,18896,18908,18544,113,18917,127,18875,2,18500,18612,18613
217,0.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
18119,5.0,0.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
17745,5.0,5.0,0.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,4.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,5.0,4.0,4.0,4.0,4.0,4.0,5.0


In [39]:
# Select 5 star pairs
good_matches = []
for i in xrange(len(available_user_ids)):
    for j in xrange(i+1, len(available_user_ids)):
        rating1 = float(meeting_score_matrix[available_user_ids[i]][available_user_ids[j]])
        rating2 = float(meeting_score_matrix[available_user_ids[j]][available_user_ids[i]])
        if rating1+rating2 == 10:
            good_matches.append((available_user_ids[i], available_user_ids[j]))


In [40]:
print good_matches

[(217, 18119), (217, 17745), (217, 16570), (217, 1286), (217, 17668), (217, 18096), (217, 18182), (217, 17738), (217, 18184), (217, 18095), (217, 18863), (217, 17495), (217, 422), (217, 100), (217, 16512), (217, 18903), (217, 474), (217, 18902), (217, 18818), (217, 18898), (217, 475), (217, 18908), (217, 18544), (217, 113), (217, 18917), (217, 127), (217, 18875), (217, 2), (217, 18500), (217, 18612), (217, 18613), (18119, 17745), (18119, 16570), (18119, 1286), (18119, 18096), (18119, 17738), (18119, 18184), (18119, 18095), (18119, 18863), (18119, 422), (18119, 100), (18119, 18903), (18119, 474), (18119, 18902), (18119, 18818), (18119, 18898), (18119, 18908), (18119, 18544), (18119, 113), (18119, 18917), (18119, 18875), (18119, 2), (18119, 18500), (18119, 18612), (18119, 18613), (17745, 16570), (17745, 1286), (17745, 17738), (17745, 100), (17745, 18818), (17745, 18908), (17745, 113), (17745, 18917), (17745, 18613), (16570, 1286), (16570, 17738), (16570, 18184), (16570, 18095), (16570, 1