In [1]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
# import implicit
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

In [2]:
# Read both raw tables and discard the user-agent / region / country columns
# in the same expression that loads each file.
articles_df = pd.read_csv('shared_articles.csv').drop(
    columns=['authorUserAgent', 'authorRegion', 'authorCountry']
)
interactions_df = pd.read_csv('users_interactions.csv').drop(
    columns=['userAgent', 'userRegion', 'userCountry']
)
articles_df.head()

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en


In [3]:
# Table dimensions next to the number of distinct contentId values.
articles_df.shape, articles_df['contentId'].nunique()

((3122, 10), 3057)

In [4]:
# Preview the first rows of the interactions table.
interactions_df.head()

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714


In [5]:
# Keep only the 'CONTENT SHARED' rows of the article table and drop the
# now-constant eventType column. The filter and the drop are chained into a
# single new frame (no inplace mutation of a filtered slice). The guard makes
# the cell safe to re-run: once eventType has been removed, articles_df is
# already in its final state and filtering on the missing column would raise
# KeyError.
if 'eventType' in articles_df.columns:
    articles_df = (
        articles_df
        .loc[articles_df['eventType'] == 'CONTENT SHARED']
        .drop(columns='eventType')
    )

# Attach each article's title to the interactions on it. The inner join
# discards interactions whose contentId has no remaining row in articles_df.
df = pd.merge(
    interactions_df[['contentId', 'personId', 'eventType']],
    articles_df[['contentId', 'title']],
    how='inner',
    on='contentId',
)
df.head()

Unnamed: 0,contentId,personId,eventType,title
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
1,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem
3,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem
4,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem


In [6]:
# Number of merged interaction rows per event type.
df['eventType'].value_counts()

VIEW               61043
LIKE                5745
BOOKMARK            2463
COMMENT CREATED     1611
FOLLOW              1407
Name: eventType, dtype: int64

In [7]:
# Numeric weight assigned to each interaction type.
event_type_strength = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 3.0,
    'FOLLOW': 4.0,
    'COMMENT CREATED': 5.0,
}

# Plain dict lookup per row: an event type that is missing from the table
# raises KeyError instead of producing a missing value.
df['eventStrength'] = df['eventType'].map(event_type_strength.__getitem__)
df.sample(10)

Unnamed: 0,contentId,personId,eventType,title,eventStrength
7366,6044362651232258738,4067645678703694326,VIEW,Cinco competências comportamentais para você s...,1.0
1756,-9192549002213406534,-1032019229384696495,VIEW,Chrome OS now has Material Design for the desktop,1.0
7826,-730957269757756529,2553895156129400476,VIEW,Por que a limitação da banda larga é uma forma...,1.0
25742,-9222795471790223670,-1032019229384696495,FOLLOW,Uber lança serviço de helicóptero em SP com pr...,4.0
13720,-8287402887944984163,-2626634673110551643,VIEW,Containers are not VMs,1.0
10282,279771472506428952,-1443636648652872475,LIKE,5 Unique Features Of Google Compute Engine Tha...,2.0
14238,8163984238975480171,-3508202263850468997,VIEW,Apple e Google querem que você pague por assin...,1.0
63529,4904775878165401439,-8670749047273764903,BOOKMARK,My Approach to Mobile Accessibility Testing - ...,3.0
68946,8185199775369928004,-1352064057049251194,VIEW,Se eu fosse falar de todas as ideias péssimas ...,1.0
61676,7814856426770804213,781132119141619887,VIEW,New MacBook Pro is not a Laptop for Developers...,1.0


In [8]:
# Row/column count of the merged frame before duplicate rows are removed.
df.shape

(72269, 5)

In [9]:
# Remove rows that are exact duplicates across every column.
df = df.drop_duplicates()

# Total strength per (person, article). eventStrength is selected explicitly
# before summing: a bare .sum() also aggregates the string eventType column,
# which on pandas >= 2.0 concatenates the strings into a junk column and on
# older versions only works through silent nuisance-column dropping.
grouped_df = (
    df.groupby(['personId', 'contentId', 'title'])['eventStrength']
    .sum()
    .reset_index()
)
grouped_df.sample(10)

Unnamed: 0,personId,contentId,title,eventStrength
12360,-2979881261169775358,-4205346868684833897,"Google launches Springboard, an AI-powered ass...",1.0
2482,-8399605302938582500,-8511291357261863413,Dark Scrum,1.0
11771,-3325685634318512394,7419040071212162906,This 8-Page Website Generated Six-Figures in i...,1.0
3307,-7990997793599977496,3998743085432844199,Netflix emociona com vídeo destacando personag...,3.0
38096,7911347399014555575,9175693555063886126,15 minutos sobre Docker,1.0
5067,-6944500707172804068,6245165134513326654,ASSISTA: show une 100 drones e uma orquestra s...,1.0
38158,7948079555216525045,-2948321821574578861,Quando usar paginação e quando usar scroll inf...,1.0
25505,2279740393166882579,-3338916066794638254,Iniciando com o Docker: Criando suas próprias ...,1.0
38920,8414731042150985013,809601605585939618,​Microsoft and Canonical partner to bring Ubun...,1.0
26680,2804812532901416058,5317978293082383492,Running Kubernetes Locally via Docker,1.0


In [10]:
# Row/column count of df after exact-duplicate rows were dropped.
df.shape

(50910, 5)

In [11]:
# Convert the title and both id columns to pandas categoricals, then expose
# the categorical codes of the two id columns as integer index columns.
for _col in ('title', 'personId', 'contentId'):
    grouped_df[_col] = grouped_df[_col].astype('category')

grouped_df['person_id'] = grouped_df['personId'].cat.codes
grouped_df['content_id'] = grouped_df['contentId'].cat.codes
grouped_df.sample(10)

Unnamed: 0,personId,contentId,title,eventStrength,person_id,content_id
19093,-519417515410583677,-4483059513836093676,Grow fast or die slow: Focusing on customer su...,1.0,876,776
34428,5683029675627635125,-3040610224044779845,Things you probably didn't know you could do w...,1.0,1522,1006
20172,-108842214936804958,5842191581670322333,3 Pillars of the Most Successful Tech Products,3.0,926,2421
26182,2551221442168658664,2690205894709699741,8 of the Sweetest Words for Candy,1.0,1206,1904
3975,-7472538368234044044,-4509487968959834430,O valor de seu trabalho de coaching vai depend...,3.0,178,770
28350,3429602690322213789,-1484165300616478995,Slack and Uber's parting proves when apps unit...,1.0,1301,1231
13987,-2626634673110551643,9215261273565326920,"Meet Mycroft, the open source AI who wants to ...",1.0,682,2975
12530,-2979881261169775358,2581138407738454418,10 Modern Software Over-Engineering Mistakes,1.0,651,1881
16175,-1443636648652872475,-6968139157190736175,Financial services firm processes 25 billion s...,1.0,798,373
32298,4313045637915476309,6153710684104095269,"Testes mostram um ótimo desempenho do ""Ubuntu ...",1.0,1394,2474


In [12]:
# Coordinate data shared by both orientations of the interaction matrix.
strengths = grouped_df['eventStrength'].astype(float)
content_idx = grouped_df['content_id']
person_idx = grouped_df['person_id']

# Rows indexed by content_id, columns by person_id.
sparse_content_person = sparse.csr_matrix((strengths, (content_idx, person_idx)))
# Rows indexed by person_id, columns by content_id.
sparse_person_content = sparse.csr_matrix((strengths, (person_idx, content_idx)))

for matrix in (sparse_content_person, sparse_person_content):
    print(matrix.shape)

(2979, 1895)
(1895, 2979)


In [32]:
# np.save('sparse_content_person',sparse_content_person)
# np.save('sparse_person_content',sparse_person_content)
# data = np.load('sparse_content_person.npy',allow_pickle=True)

In [23]:
import pickle

In [26]:
# Serialize the content-by-person matrix to 'd.pickle' in the working directory.
with open('d.pickle', mode='wb') as f:
    pickle.dump(sparse_content_person, file=f)

In [34]:
# Read the pickled object back from 'd.pickle'. The context manager closes
# the file handle deterministically; the bare open(...) call left it open
# until garbage collection.
# NOTE: unpickling can execute arbitrary code, so only load files produced by
# a trusted source (here, the dump cell of this notebook).
with open('d.pickle', 'rb') as f:
    d = pickle.load(f)