In [1012]:
import pandas as pd

# Load the users, posts and views tables from the Mldata folder.
df_users, df_posts, df_views = (
    pd.read_csv(csv_path)
    for csv_path in ("Mldata/users.csv", "Mldata/posts.csv", "Mldata/views.csv")
)

In [1029]:
# Give the posts table short column names, then print summary statistics
# for each of the three tables in turn.
df_posts.columns = ['_id', 'title', 'category', 'ptype']
for frame in (df_users, df_posts, df_views):
    print(frame.describe())


                             _id            name gender      academics
count                        118             118    118            118
unique                       118             118      3              3
top     5d60098a653a331687083238  Fauziya Shaikh   male  undergraduate
freq                           1               1     72             68
                             _id             title     category    ptype
count                        493               493          465      493
unique                       493               477          231        4
top     5ecb7155eaff6b0c3a58a486  PENCIL RENDERING  Photography  artwork
freq                           1                 3           81      241
                         user_id                   post_id  \
count                       1449                      1449   
unique                       118                       495   
top     5d60098a653a331687083238  5ec1fd0974f7660d73aa0fd5   
freq                         230

In [1030]:
# Map each user id to its row position in df_users.
user_dictionary = {user_id: row for row, user_id in enumerate(df_users['_id'])}

# Map each post id to its row position in df_posts.
posts_dictionary = {post_id: row for row, post_id in enumerate(df_posts['_id'])}


In [1031]:
# Collect every distinct tag found in the '|'-separated category strings,
# skipping posts whose category is missing.
item_features = {
    tag
    for category in df_posts['category']
    if pd.notnull(category)
    for tag in category.split('|')
}

# The post types are used as features as well.
item_features.update(('blog', 'artwork', 'project', 'skill'))
print(len(item_features))

238


In [1032]:
import numpy as np
# Sort the tags so that every feature gets the same column index on every
# run. Iterating the set directly yields an order that is not guaranteed to
# be the same in another interpreter session, which would make the saved
# weight/feature matrices impossible to line up with their feature names.
features = np.array(sorted(item_features))

In [1033]:
# Reverse lookup: feature name -> its position in `features`.
features_dictionary = {name: column for column, name in enumerate(features)}


In [1034]:
# One row per post, one column per feature. Each cell counts how often the
# feature occurs in the post's category tags, its title words and its type.
X = np.zeros((len(df_posts), len(features)))

post_fields = zip(df_posts['category'], df_posts['title'], df_posts['ptype'])
for row, (category, title, ptype) in enumerate(post_fields):
    # Category tags (the category may be missing).
    if pd.notnull(category):
        for tag in category.split('|'):
            X[row, features_dictionary[tag]] += 1

    # Title words count only when they match a known feature exactly.
    for word in title.split(' '):
        if word in item_features:
            X[row, features_dictionary[word]] += 1

    # The post type is always a known feature.
    X[row, features_dictionary[ptype]] += 1

print(np.sum(X))

1431.0


In [1035]:
# One weight vector per user, with one extra leading weight for the bias.
n_weights_per_user = len(features) + 1
theta = np.zeros((len(df_users), n_weights_per_user))

# Prepend a constant column of ones to the feature matrix to pair with the
# bias weight. NOTE: re-running this cell prepends another column each time.
X = np.insert(X, 0, 1, axis=1)

In [1036]:
# Interaction matrix (posts x users): 10 marks a recorded view of the post by
# the user, 0 means no recorded view.
Y = np.zeros((len(df_posts), len(df_users)))
for viewer, viewed_post in zip(df_views['user_id'], df_views['post_id']):
    # Ignore views that refer to a post missing from the posts table.
    if viewed_post not in posts_dictionary:
        continue
    Y[posts_dictionary[viewed_post], user_dictionary[viewer]] = 10
print(np.sum(Y))

13950.0


In [1037]:
print(Y.shape)
print(X.shape)
print(theta.shape)
number_users = Y.shape[1]
number_posts = Y.shape[0]
# Scale every post's feature row so that it sums to 1.
row_sums = X.sum(axis=1)
# Use the `np` alias: the visible cells import numpy only as `np`, so the
# bare name `numpy` raises NameError on a freshly started kernel.
X = X / row_sums[:, np.newaxis]


(493, 118)
(493, 239)
(118, 239)


In [1038]:
def compute_cost(X , Y , theta):
    """Return the squared-error cost over the non-zero entries of Y.

    Args:
        X: feature matrix with one row per post (posts x features).
        Y: target matrix (posts x users); entries equal to 0 are treated as
            "no observation" and do not contribute to the cost.
        theta: weight matrix with one row per user (users x features).

    Returns:
        A (1, 1) array holding the sum of (theta_j . x_i - Y[i][j])**2 / 2
        over every non-zero Y[i][j], or the int 0 when Y has no non-zero
        entry.
    """
    # Take the dimensions from Y itself instead of from notebook-level
    # variables, so the function works for any consistently shaped inputs.
    number_posts, number_users = Y.shape
    J = 0
    for j in range(number_users):
        # reshape hands back a fresh column view; the caller's arrays are
        # never touched (the in-place resize used before operated on views).
        the = theta[j , :].reshape(-1 , 1)
        for i in range(number_posts):
            if Y[i][j]:
                x = X[i , :].reshape(-1 , 1)
                J += ((np.dot(the.T , x) - Y[i][j]) ** 2)/2

    return J

def gradient_descent(alpha , num_iterations , X , Y , theta):
    """Fit the per-user weights in ``theta`` by batch gradient descent.

    Only the non-zero entries of Y contribute to the gradient. ``theta`` is
    updated in place and nothing is returned. The current cost is printed
    every 100 iterations (including iteration 0).

    Args:
        alpha: learning rate.
        num_iterations: number of full gradient steps to take.
        X: feature matrix with one row per post (posts x features).
        Y: target matrix (posts x users); zeros mean "no observation".
        theta: weight matrix with one row per user (users x features).
    """
    # Y does not change during training, so the mask is computed once.
    observed = Y != 0
    for k in range(num_iterations):
        # Prediction error for every (post, user) pair, forced to zero where
        # there is no observation so those pairs add nothing to the gradient.
        residual = np.where(observed , np.dot(X , theta.T) - Y , 0.0)
        # Row j of residual.T . X is sum_i residual[i, j] * X[i, :], i.e. the
        # gradient for user j. Each user's weights depend only on that user's
        # own row, so updating all users at once matches updating them one
        # after another.
        theta[:] = theta - alpha * np.dot(residual.T , X)

        if k%100 == 0:
            print(k , compute_cost(X , Y , theta))
        

In [1041]:
# Report the cost before training, run gradient descent (which prints the
# cost every 100 iterations), then report the cost after training.
print(compute_cost(X=X, Y=Y, theta=theta))
gradient_descent(alpha=0.01, num_iterations=4500, X=X, Y=Y, theta=theta)
print(compute_cost(X=X, Y=Y, theta=theta))

[[61.82890985]]
0 [[61.79555397]]
100 [[58.58448281]]
200 [[55.60365172]]
300 [[52.83102161]]
400 [[50.24726838]]
500 [[47.8353746]]
600 [[45.5802926]]
700 [[43.46866503]]
800 [[41.48859166]]
900 [[39.62943388]]
1000 [[37.88165001]]
1100 [[36.23665585]]
1200 [[34.68670612]]
1300 [[33.22479334]]
1400 [[31.84456102]]
[[30.55291435]]


In [1067]:
def suggest_posts(user_index, top_n=10):
    """Print the categories a user has already seen and the top suggestions.

    Reads the notebook-level ``theta``, ``X``, ``Y`` and ``df_posts``.

    Posts whose entry in ``Y`` for this user equals 10 are listed as already
    seen. Every other post is ranked by how close its predicted score
    (theta_user . x_post) is to 10, closest first, and the categories of the
    first ``top_n`` of them are printed.

    Args:
        user_index: row of ``theta`` / column of ``Y`` for the user.
        top_n: maximum number of recommendations to print (default 10;
            values below 0 are treated as 0).
    """
    #user_index = user_dictionary[user]
    theta_user = theta[user_index , :]
    already_seen = []
    recommended = {}
    # Iterate over the rows of Y directly rather than relying on a
    # notebook-level post count.
    for i in range(Y.shape[0]):
        if(Y[i][user_index] == 10):
            already_seen.append(i)
        else:
            # Both operands are 1-D, so the dot product is a plain scalar;
            # no reshaping is needed.
            recommended[i] = abs(10 - np.dot(theta_user , X[i , :]))

    # sorted() is stable, so equally scored posts keep their row order.
    recommended_list = sorted(recommended.items() , key=lambda item: item[1])
    print("Already seen posts: ")
    for seen_index in already_seen:
        print(df_posts['category'][seen_index])

    print("")
    print("")
    print("Recommended: ")
    for post_row, _ in recommended_list[:max(top_n , 0)]:
        print(df_posts['category'][post_row])

            

In [1084]:
# Show recommendations for the user stored at row position 28.
suggest_posts(user_index=28)

Already seen posts: 
Photography
Photography
Photography
Photography


Recommended: 
Photography
Photography
Photography
Photography
Photography
Photography
Photography
Photography
Photography
Photography


In [1001]:
def suggest_related_posts(post_index, top_n=10):
    """Print the categories of the posts most similar to a given post.

    Reads the notebook-level ``X`` and ``df_posts``. Similarity is the
    Euclidean distance between feature rows of ``X`` (smaller is closer).
    The category of the query post is printed first, followed by a blank
    line and the categories of the ``top_n`` nearest other posts.

    Args:
        post_index: row position of the query post in ``X`` / ``df_posts``.
        top_n: maximum number of related posts to print (default 10;
            values below 0 are treated as 0).
    """
    #post_index = post_dictionary[post_index]
    this_vector = X[post_index , :]
    print(df_posts['category'][post_index])
    print()
    # Use the `np` alias: the visible cells import numpy only as `np`, so the
    # bare name `numpy` raises NameError on a freshly started kernel.
    # The query post is skipped: it is always at distance 0 from itself and
    # would otherwise take up the first slot of its own suggestions.
    recommended = {
        i: np.linalg.norm(this_vector - X[i , :])
        for i in range(X.shape[0])
        if i != post_index
    }
    # sorted() is stable, so equally distant posts keep their row order.
    recommended_list = sorted(recommended.items() , key=lambda item: item[1])
    for post_row, _ in recommended_list[:max(top_n , 0)]:
        print(df_posts['category'][post_row])

In [1085]:
# Read a post's row position from the user and list the posts related to it.
# NOTE: the output of this cell depends on what is typed at the prompt.
ind = int(input())
suggest_related_posts(post_index=ind)

 12


Drawings

Drawings
Drawings
Drawings
Drawings
Drawings
Drawings
Drawings
Drawings
Drawings
Drawings


In [1087]:
# Persist the learned per-user weights and the feature matrix to disk.
np.save(file='content_theta', arr=theta)
np.save(file='content_features', arr=X)