In [1]:
import pandas as pd

## 1.1 Reading shared_articles data

In [2]:
# Read the shared-articles events and keep only the rows whose
# eventType is exactly 'CONTENT SHARED'.
articles_df = (
    pd.read_csv('data/shared_articles.csv')
    .loc[lambda events: events['eventType'] == 'CONTENT SHARED']
)

In [4]:
# Preview the first five filtered article rows.
articles_df.iloc[:5]

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en
5,1459194522,CONTENT SHARED,-2826566343807132236,4340306774493623681,8940341205206233829,,,,HTML,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,en


In [5]:
# Full (untruncated) URL of the row whose index label is 1,
# fetched with a single label-based lookup.
articles_df.loc[1, 'url']

'http://www.nytimes.com/2016/03/28/business/dealbook/ethereum-a-virtual-currency-enables-transactions-that-rival-bitcoins.html'

In [25]:
# How many articles there are per value of the 'lang' column.
articles_df.loc[:, 'lang'].value_counts()

en    2211
pt     829
la       3
es       2
ja       2
Name: lang, dtype: int64

In [34]:
# Number of article rows per authorPersonId.
articles_df.groupby('authorPersonId').size()

authorPersonId
-9120685872592674274    10
-9047547311469006438     4
-9016528795238256703    14
-9009798162809551896     5
-9001583565812478106     2
                        ..
 8841741572929644986     1
 8968131284214320024    12
 9038446466275805109     1
 9109075639526981934     6
 9210530975708218054     2
Length: 252, dtype: int64

## 1.2 Reading users_interactions data

In [76]:
# Load the raw user-interaction events from disk.
interactions_df = pd.read_csv(filepath_or_buffer='data/users_interactions.csv')

In [77]:
# Preview the first five interaction rows.
interactions_df.iloc[:5]

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,


In [78]:
# Numeric weight given to each interaction type.
event_type_strength = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 2.5,
    'FOLLOW': 3.0,
    'COMMENT CREATED': 4.0,
}

# Look up the weight of every row's eventType; an event type that is
# missing from the mapping raises KeyError rather than being skipped.
interactions_df['eventStrength'] = interactions_df['eventType'].map(
    lambda event_type: event_type_strength[event_type]
)

In [83]:
# Step 1: one entry per (personId, contentId) pair.
# Step 2: count those entries per personId, i.e. the number of
#         distinct contentId values each person interacted with.
t = (
    interactions_df
    .groupby(['personId', 'contentId'])
    .size()
    .groupby(level='personId')
    .size()
)

# People with fewer than 5 distinct articles.
t.loc[t < 5]

personId
-9150583489352258206    1
-9099478998637725255    2
-9083704948999852989    3
-9048557723087354030    3
-9012030317377670760    1
                       ..
 9001399366994458639    1
 9008788011897883731    2
 9042983166068668843    1
 9128735267433534024    1
 9152852494775096912    2
Length: 755, dtype: int64

In [67]:
# The user whose interactions are inspected in this cell.
person_id = -1479311724257856983

# Two-step read: select that user's rows first, then take the
# eventStrength column of the selected rows.
interactions_df.loc[interactions_df['personId'].eq(person_id)]['eventStrength']

29       1.0
46       1.0
583      1.0
755      1.0
760      2.5
        ... 
67815    1.0
69228    1.0
71505    1.0
71609    1.0
71914    1.0
Name: eventStrength, Length: 375, dtype: float64

In [73]:
# Single-step read: row mask and column label in one .loc call.
interactions_df.loc[interactions_df['personId'].eq(person_id), 'eventStrength']

29       1.0
46       1.0
583      1.0
755      1.0
760      2.5
        ... 
67815    1.0
69228    1.0
71505    1.0
71609    1.0
71914    1.0
Name: eventStrength, Length: 375, dtype: float64

In [74]:
# Rebinds event_type_strength to a different set of weights and applies
# them to the rows of a single user only; other users' eventStrength
# values are left as they were.
event_type_strength = {
    'VIEW': 2.0,
    'LIKE': 2.0,
    'BOOKMARK': 4.5,
    'FOLLOW': 5.0,
    'COMMENT CREATED': 2.0,
}
person_id = -1479311724257856983

# Build the row mask once and reuse it on both sides of the assignment,
# so the rows that are read and the rows that are written are the same.
person_mask = interactions_df['personId'] == person_id
interactions_df.loc[person_mask, 'eventStrength'] = (
    interactions_df.loc[person_mask, 'eventType']
    .apply(lambda event_type: event_type_strength[event_type])
)

In [75]:
# Show the eventStrength values currently stored for person_id.
interactions_df.loc[interactions_df['personId'].eq(person_id), 'eventStrength']

29       2.0
46       2.0
583      2.0
755      2.0
760      4.5
        ... 
67815    2.0
69228    2.0
71505    2.0
71609    2.0
71914    2.0
Name: eventStrength, Length: 375, dtype: float64

In [49]:
# Overwrite every eventStrength value of person_id's rows with 1.
interactions_df.loc[interactions_df['personId'].eq(person_id), 'eventStrength'] = 1

In [50]:
# Show the eventStrength values currently stored for person_id.
interactions_df.loc[interactions_df['personId'].eq(person_id), 'eventStrength']

29       1.0
46       1.0
583      1.0
755      1.0
760      1.0
        ... 
67815    1.0
69228    1.0
71505    1.0
71609    1.0
71914    1.0
Name: eventStrength, Length: 375, dtype: float64

## 2. Interaction counts per user, session and event type


In [8]:
# Number of interaction rows recorded for each personId.
users_interactions_count = interactions_df.groupby('personId').size()

# First five entries of that per-person count.
users_interactions_count.iloc[:5]

personId
-9223121837663643404    63
-9212075797126931087     9
-9207251133131336884    14
-9199575329909162940    15
-9196668942822132778    10
dtype: int64

In [19]:
# Keep only the people with at least 5 interaction rows.
users_interactions_count[users_interactions_count.ge(5)]

personId
-9223121837663643404     63
-9212075797126931087      9
-9207251133131336884     14
-9199575329909162940     15
-9196668942822132778     10
                       ... 
 9165571805999894845      8
 9187866633451383747     26
 9191849144618614467     19
 9199170757466086545     26
 9210530975708218054    135
Length: 1311, dtype: int64

In [15]:
# Number of interaction rows per (personId, sessionId) pair.
users_intses = interactions_df.groupby(by=['personId', 'sessionId']).size()
users_intses

personId              sessionId           
-9223121837663643404  -9034528865741269253    1
                      -8865295409778928525    1
                      -7824685088995468735    1
                      -7208932310643919246    2
                      -6503497614851251330    1
                                             ..
 9210530975708218054   7645120069872730318    2
                       8377960100985815035    1
                       8656235142853766978    3
                       8710635010472328184    3
                       9104250425956970239    2
Length: 28740, dtype: int64

In [31]:
# Number of interaction rows per eventType.
interactions_df.groupby(by=['eventType']).size()

eventType
BOOKMARK            2463
COMMENT CREATED     1611
FOLLOW              1407
LIKE                5745
VIEW               61086
dtype: int64

In [29]:
# Total number of interaction rows.
len(interactions_df)

72312

In [33]:
# Share of all interaction rows that each eventType accounts for.
interactions_df.groupby('eventType', as_index=True).size().div(len(interactions_df))

eventType
BOOKMARK           0.034061
COMMENT CREATED    0.022278
FOLLOW             0.019457
LIKE               0.079447
VIEW               0.844756
dtype: float64