In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.cross_validation import train_test_split
import random
import math
import scipy
import collections
from sklearn.metrics import confusion_matrix,accuracy_score

path = os.getcwd()

ADJ = pd.read_csv(path+'/ADJ.csv',index_col=0)
ADJ.columns = [int(col.strip('V')) for col in ADJ.columns]
click = pd.read_csv(path+'/click.csv',index_col=0)
click.columns = [int(col.strip('V')) for col in click.columns]
X_ad = pd.read_csv(path+'/X_ad.csv',index_col=0)
X_user = pd.read_csv(path+'/X_user.csv',index_col=0)

X_user = X_user[:100]
click = click[:100]
ADJ = ADJ.loc[1:100,1:100]

# normalize the demographics
X_user['age'] = X_user.apply(lambda x : (x[0] - np.min(X_user.age))/(np.max(X_user.age) - np.min(X_user.age)),axis=1)
X_user['edu'] = X_user.apply(lambda x : (x[2] - np.min(X_user.edu))/(np.max(X_user.edu) - np.min(X_user.edu)),axis=1)
X_user['income'] = X_user.apply(lambda x : (x[3] - np.min(X_user.income))/(np.max(X_user.income) - np.min(X_user.income)),axis=1)

### Sandbox

In [None]:
"""spits out the k nearest neighbours of the index(user) of an unweighted,undirected graph"""
k = 30
idx = 1 


def distance_matrix(size):
    '''returns the euclidean matrix between all pair users by (age,gender,income,degree of connection)'''
    dist_matrix = np.zeros((size,size))
    for k in range(0,size):
        for i in range(0,size):
            degree_of_connection = degree_mat[k+1][i+1]
            v1 = np.append(np.array(X_user.loc[k+1,:]),1)
            if degree_of_connection == 0:
                v2 = np.append(np.array(X_user.loc[i+1,:]),1.)
            else:
                v2 = np.append(np.array(X_user.loc[i+1,:]),degree_of_connection)
            dist_matrix[k][i] = scipy.spatial.distance.euclidean(v1, v2)
    return dist_matrix



def neighbour_set_dfs(idx):
    
    '''returns the connected graph of the candidate'''
    
    def user_graph():
        
        '''Returns the graph (adjacency list) of the user network'''
    
        graph = dict()
        
        for i in range(1,101):
            graph[i] = set(list(ADJ.columns[ADJ.loc[i,:] == 1]))
        return graph

    def dfs(graph, start, visited=None):
        
        '''Returns the total connected components (list) of the given user in his/her network'''
        '''credits: Eddmann'''
    
        if visited is None:
            visited = set()
        visited.add(start)
        for next in graph[start] - visited:
            dfs(graph, next, visited)
        return visited

    return dfs(user_graph(),idx)


In [None]:
degree_mat = pd.read_csv(path+'/dist_matrix.csv',header=None)
degree_mat.columns = [_ for _ in range(1,1001)]
degree_mat.index = np.arange(1, len(degree_mat) + 1)

In [None]:
distance_matrix_100 = distance_matrix(100)

In [None]:
click.head()

In [None]:
user_id = 1
ad_test = click[:user_id] # user 1 has viewd 84% of the ad's
ad_test = ad_test.dropna(axis=1)

In [None]:
ad_test

In [None]:
user_1_ad_ids = ad_test.columns

In [None]:
#neighbour_set_dfs(user_id)


def social_network(user_id,k):
    
    '''Returns the pruned social network which has the user_ids with highest predictive power'''

    unpruned = list(neighbour_set_dfs(user_id))
    sim_dict = dict()
    pruned = list()
    
    for i in range(len(unpruned)):
        sim_dict[unpruned[i]] = distance_matrix_100[user_id-1,unpruned[i]-1]
    
    sorted_sim_dict = sorted(sim_dict.items(), key=lambda x: (-x[1], x[0]),reverse=True)
    
    for _ in range(1,k+1):
        pruned.append(sorted_sim_dict[_][0])
        
    return pruned

#social_network(6,12) 

In [279]:
def prob_ad_click(k,ad_id,user_id):
    
    usr_soc_nw = social_network(user_id,k)
    
    def prune_by_viwership(lst):
        
        '''returns only those people in social network who viewd the ad'''
        pruned = list()
        for user in lst:
            if (click.loc[user,ad_id] == 1) or (click.loc[user,ad_id] == 0):
                pruned.append(user)
        return pruned
    
    usr_soc_nw = prune_by_viwership(usr_soc_nw)
    cnt_click = 0.
    cnt_noclick = 0.
    
    for user in usr_soc_nw:
        if click.loc[user,ad_id] == 1:
            cnt_click += 1
        else:
            cnt_noclick += 1
    
    prob = (cnt_click/(cnt_click+cnt_noclick+1.)) * 1.
    return prob

In [280]:
ad_test

Unnamed: 0,1,2,4,5,6,7,8,9,10,11,...,88,90,91,92,94,95,96,98,99,100
1,1,1,1,1,0,1,0,1,1,0,...,1,1,0,1,1,0,0,1,1,1


In [291]:
k = 7
result = list()

for ad_id in user_1_ad_ids:
    if prob_ad_click(k,ad_id,99) > 0.3:
        result.append(1)
    else:
        result.append(0)
        
print k,accuracy_score(ad_test.loc[1,:], result)

7 0.452380952381
