In [4]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

<center><h1>&#10024; Social Media Article Recommendation System (Part - 2)&#10024;</h1></center>

In [5]:
%matplotlib inline 
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd 
from scipy import stats
import seaborn as sns
from pandas import DataFrame
import operator
import nltk

In [6]:
df_users = pd.read_csv("final_interactions_df.csv")
df_articles = pd.read_csv("final_articles_df.csv")

In [7]:
df_users.tail(10)

Unnamed: 0,contentId,personId,sessionId,score
72304,-2069509552243850466,-9196668942822132778,-8300596454915870873,0.204575
72305,3770082505811422124,5713241217519616260,6704552481483665372,0.167667
72306,569574447134368517,-5230721907253934520,-1055756461332933762,0.445143
72307,569574447134368517,-5230721907253934520,-1055756461332933762,0.165143
72308,5484061377044071389,3609194402293569455,-344378995821744418,0.165143
72309,-6590819806697898649,-9016528795238256703,8614469745607949425,0.343629
72310,-5813211845057621660,102305705598210278,5527770709392883642,0.203629
72311,-1999468346928419252,-9196668942822132778,-8300596454915870873,0.165143
72312,-6590819806697898649,-9016528795238256703,8614469745607949425,0.203629
72313,5669290109546991426,5713241217519616260,6704552481483665372,0.168928


In [8]:
df_articles.head()

Unnamed: 0,contentId,authorPersonId,authorSessionId,contentType,title,text,lang,datetime,score_y
0,-4110354420726924665,4340306774493623681,8940341205206233829,HTML,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en,2016-03-28 15:39:48,0.157666
1,-7292285110016212249,4340306774493623681,8940341205206233829,HTML,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en,2016-03-28 15:42:26,0.199773
2,-6151852268067518688,3891637997717104548,-1457532940883382585,HTML,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en,2016-03-28 15:47:54,2.967408
3,2448026894306402386,4340306774493623681,8940341205206233829,HTML,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en,2016-03-28 15:48:17,0.0
4,-2826566343807132236,4340306774493623681,8940341205206233829,HTML,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,en,2016-03-28 15:48:42,0.324164


In [9]:
df_users = pd.merge(df_users, df_articles[['contentId','authorPersonId','title',]], on='contentId', how ='left')

<h2>&#10024; Let's see how the text within articles look like! </h2>

In [10]:
# check texts
df_articles['text'].head(20)

0     All of this work is still very early. The firs...
1     The alarm clock wakes me at 8:00 with stream o...
2     We're excited to share the Google Data Center ...
3     The Aite Group projects the blockchain market ...
4     One of the largest and oldest organizations fo...
5     It will take time until banks come around to t...
6     When most people think about computers and rob...
7     Bitcoin.com spoke with the OpenLedger CEO, Ron...
8     Ethereum, considered by many to be the most pr...
9     A queda nas vendas e a deterioração na situaçã...
10    HTTP(S) load balancing provides global load ba...
11    Alpha This is an Alpha release of Setting Up S...
12    You may know Dell as a computer and server mak...
13    The Uber model just doesn't work for other ind...
14    Industrial adoption of IoT dubbed as Industria...
15    Artigos e Palestras ARTIGOS / 2015 12/08/2015 ...
16    Do Se Te Qu Qu Se Sa 27 28 29 30 31 Faça downl...
17    Five Bitcoin and Ethereum Based Projects t

In [11]:
# feature extraction

#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
df_articles['text'] = df_articles['text'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df_articles['text'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(3010, 72314)

In [12]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(df_articles.index, index=df_articles['title']).drop_duplicates()

In [14]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    article_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df_articles['title'].iloc[article_indices]

In [15]:
df_articles[['title','text']].head()

Unnamed: 0,title,text
0,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...
1,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...
2,Google Data Center 360° Tour,We're excited to share the Google Data Center ...
3,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...
4,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...


In [16]:
get_recommendations('Google Data Center 360° Tour')

133     Google shares data center security and design ...
635                          This year's Founders' Letter
236     Google Cloud Platform: The smart person's guid...
848     YouTube's New Messenger Means You'll Never Hav...
514     Top 5 GCP NEXT breakout sessions on YouTube (s...
827     Google I/O 2016 Preview: A Chrome/Android merg...
896     Google I/O 2016 preview: Android N, Android VR...
2778    [Tools] How to Record your Desktop Screen with...
900     Google I/O 2016 Preview: Machine Learning, Vir...
76      Google admits original enterprise cloud strate...
Name: title, dtype: object

<center><h1>&#10024; Recommending based on Score&#10024;</h1></center>

In [17]:
def Recommend_user_preference(n):
    
    tmp_1 =df_users.loc[df_users['personId'] == n].sort_values(by='score', ascending=False)

    content_ID = tmp_1['contentId'].iloc[0]
    
    tmp_2 = df_articles.loc[df_articles['contentId'] == content_ID]

    title_content = tmp_2['title'].iloc[0]
   
    articles_results = get_recommendations(title_content)
    
    articles_tmp = df_users.loc[df_users['personId'] == n]
    
    recommendation_results = articles_results[~articles_results.isin(articles_tmp['title'])]
                                                                                                      
    return recommendation_results

In [18]:
Recommend_user_preference(-9016528795238256703)


526     Bots won't replace apps. Better apps will repl...
2304    Google will show AMP URLs before App deep link...
320     The End Of Apps As We Know Them - Inside Intercom
2423    How to use Docker to run ASP.NET Core apps on ...
2169    What 2 Years of Android Development Have Taugh...
835     WhatsApp just released desktop apps for Mac an...
627     5 reasons your employees aren't sharing their ...
Name: title, dtype: object

<center><h1>&#10024; Recommending new authors to users&#10024;</h1></center>

In [19]:
def Recommend_authors_to_user(n):
    
    title_result = Recommend_user_preference(n)
    
    author_id_list = df_articles[df_articles['title'].isin(title_result)]['authorPersonId']
    
    articles_tmp = df_users[df_users['personId']==n]
    
    recommendation_results = author_id_list[~author_id_list.isin(articles_tmp['authorPersonId'])]
                                                                                                      
    return recommendation_results
    

In [20]:
Recommend_authors_to_user(5713241217519616260)

421      -48161796606086482
981     -492179582639058400
1654    3609194402293569455
1723   -9120685872592674274
2141    3891637997717104548
2258   -3535274684588209118
2465    5660542693104786364
2556    3609194402293569455
2650    3609194402293569455
Name: authorPersonId, dtype: int64

<center><h1>&#10024; Count Views&#10024;</h1></center>

In [21]:
df1 = df_users.groupby('contentId', as_index=False).agg({"score": "sum"})
df1.head(5)

Unnamed: 0,contentId,score
0,-9222795471790223670,12.536278
1,-9216926795620865886,7.198155
2,-9194572880052200111,6.889131
3,-9192549002213406534,19.426535
4,-9190737901804729417,2.353585


In [22]:
df2 =  df_users.groupby('contentId').contentId.count()
df2.head(5)

contentId
-9222795471790223670    26
-9216926795620865886    21
-9194572880052200111    29
-9192549002213406534    56
-9190737901804729417     9
Name: contentId, dtype: int64

In [23]:
df2 = pd.DataFrame(np.array(df2))
df2.columns=['Views']
df2.head(5)

Unnamed: 0,Views
0,26
1,21
2,29
3,56
4,9


In [24]:
df3=df_users['contentId']
df3.head(5)

0   -3499919498720038879
1    8890720798209849691
2     310515487419366995
3     310515487419366995
4   -7820640624231356730
Name: contentId, dtype: int64

In [25]:
df4 = pd.concat([df3,df2],axis=1, ignore_index=True)
df4.columns=['contentId','Views']
df4.head()

Unnamed: 0,contentId,Views
0,-3499919498720038879,26.0
1,8890720798209849691,21.0
2,310515487419366995,29.0
3,310515487419366995,56.0
4,-7820640624231356730,9.0


In [26]:
articles_influence= pd.concat([df1, df2], axis=1)
articles_influence.columns=['contentId','Popularity','Views']

In [27]:
articles_influence.head(20)

Unnamed: 0,contentId,Popularity,Views
0,-9222795471790223670,12.536278,26
1,-9216926795620865886,7.198155,21
2,-9194572880052200111,6.889131,29
3,-9192549002213406534,19.426535,56
4,-9190737901804729417,2.353585,9
5,-9189659052158407108,9.812936,36
6,-9184137057748005562,0.254752,1
7,-9176143510534135851,10.351386,33
8,-9172673334835262304,0.198827,1
9,-9171475473795142532,1.148127,5


In [28]:
df5=df_articles[['contentId','title','text']]
df5.head()

Unnamed: 0,contentId,title,text
0,-4110354420726924665,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...
1,-7292285110016212249,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...
2,-6151852268067518688,Google Data Center 360° Tour,We're excited to share the Google Data Center ...
3,2448026894306402386,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...
4,-2826566343807132236,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...


In [95]:
df_articles_influence = pd.merge(df5, articles_influence, on=['contentId'], how='left').fillna(0)

In [96]:
df_articles_influence.sort_values(by='Popularity', ascending=False)

Unnamed: 0,contentId,title,text,Popularity,Views
1247,8657408509986329668,Pull request first - Practical Blend,Pull request first After two years of working ...,126.001895,294.0
553,-6843047699859121724,"Ganhe 6 meses de acesso ao Pluralsight, maior ...","Ganhe 6 meses de acesso ao Pluralsight, maior ...",112.896506,281.0
1097,2857117417189640073,Running GV sprints inside corporates - learn f...,Running GV sprints inside corporates - learn f...,98.015931,241.0
1729,-6783772548752091658,Livro: Retrospectivas Divertidas,"Neste livro, nós fornecemos um conjunto de fer...",93.926604,294.0
1591,-133139342397538859,"Novo workaholic trabalha, pratica esportes e t...",Novo workaholic não abre mão do esporte e da f...,89.475553,315.0
...,...,...,...,...,...
599,-1549326815285717811,"Machine Learning, AI, and the Emperor's Vest","Machine Learning, AI, and the Emperor's Vest T...",0.000000,0.0
1249,-4880614998012292765,New smart toothbrush from Philips Sonicare is ...,The most high-tech room in your house may soon...,0.000000,0.0
1345,-8120372343967636100,Intel x86s hide The Intel Management Engine (ME),Recent Intel x86 processors implement a secret...,0.000000,0.0
373,-7863574628164051955,Announcing TensorFlow 0.8 - now with distribut...,Google uses machine learning across a wide ran...,0.000000,0.0


<center><h1>&#10024; Recommending users&#10024;</h1></center>

In [31]:
# n = personId ; m = contentId

# recommend top 10 users you may like

def Recommend_users_youmaylike(n, m):

    user_list = df_users.loc[df_users['contentId'] == m]
    
    user_list = user_list.loc[user_list['personId'] != n]
    
    user_list = user_list.sort_values('score', ascending=False)
    
    result = user_list['personId'].unique()
    
    recommendation_results = result[0:9]
    
    return recommendation_results

In [32]:
Recommend_users_youmaylike(2653698047369148236, -1633984990770981161)

array([  268671367195911338, -2314784854300761836,  2195040187466632600,
        5200962297848391505, -2470248588564221167,  3861743074882551715,
        2833428826475063405,  6571439471025671820,  1623838599684589103])

<center><h1>&#10024; Generate Tags and Tags Recommendation&#10024;</h1></center>

## Generate Tags

In [33]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [34]:
# get the full content with the "title + text"
df_articles['content']= df_articles['title'] + df_articles['text']

# take one article as example

text = df_articles['content'][1]

In [35]:
# get all the keywords within selected labels
doc = nlp(text)
print([(X.text, X.label_) for X in doc.ents if X.label_ not in ['DATE','CARDINAL','ORDINAL','MONEY','TIME']])

[('Bitcoin', 'GPE'), ('USDcoin', 'ORG'), ('BTC', 'ORG'), ('Trump', 'PRODUCT'), ('BTC', 'ORG'), ('TradeBot', 'ORG'), ('USDcoin', 'ORG'), ('Trump', 'PRODUCT'), ('USDcoin', 'ORG'), ('Trump', 'PERSON'), ('USDcoin', 'ORG'), ('Confidential Transactions', 'ORG'), ('Trump', 'PERSON'), ('CNYcoin', 'ORG'), ('GBPcoin', 'GPE'), ('Branson', 'PERSON'), ('British', 'NORP'), ('Britain', 'GPE'), ('the Great Debt Default', 'EVENT'), ('the GoatData Project', 'ORG'), ('TeachBot', 'ORG'), ('TeachBot', 'ORG'), ('BTC', 'ORG'), ('Sherpas', 'ORG'), ('Tibetan', 'NORP'), ('TeachBot', 'ORG'), ('GoatData', 'ORG'), ('WinterHoof', 'ORG'), ('the Artificial General Intelligence', 'ORG'), ('Swiss', 'NORP'), ('BTC', 'ORG'), ('Satoshi City', 'GPE'), ('Mars', 'LOC')]


In [36]:
article = nlp(text)
len(article.ents)

42

In [37]:
labels = [x.label_ for x in article.ents if x.label_ not in ['DATE','CARDINAL','ORDINAL','MONEY','TIME']]
Counter(labels)

Counter({'GPE': 4,
         'ORG': 19,
         'PRODUCT': 2,
         'PERSON': 3,
         'NORP': 3,
         'EVENT': 1,
         'LOC': 1})

In [38]:
# Count the frequncy of each keywords
items = [x.text for x in article.ents if x.label_ not in ['DATE','CARDINAL','ORDINAL','MONEY','TIME']]
Counter(items).most_common(20)

[('USDcoin', 4),
 ('BTC', 4),
 ('Trump', 4),
 ('TeachBot', 3),
 ('Bitcoin', 1),
 ('TradeBot', 1),
 ('Confidential Transactions', 1),
 ('CNYcoin', 1),
 ('GBPcoin', 1),
 ('Branson', 1),
 ('British', 1),
 ('Britain', 1),
 ('the Great Debt Default', 1),
 ('the GoatData Project', 1),
 ('Sherpas', 1),
 ('Tibetan', 1),
 ('GoatData', 1),
 ('WinterHoof', 1),
 ('the Artificial General Intelligence', 1),
 ('Swiss', 1)]

In [39]:
# take the top 5 most frequent keywords as tags of this article
x = Counter(items)
tags = sorted(x, key=x.get, reverse=True)
tags[0:5]

['USDcoin', 'BTC', 'Trump', 'TeachBot', 'Bitcoin']

## Generate tags for each article in our dataset

In [40]:
def get_tags(x):

    article = nlp(x)
    
    items = [x.text for x in article.ents if x.label_ not in ['DATE','CARDINAL','ORDINAL','MONEY']]
    
    tags_value = Counter(items)
    
    tags = sorted(tags_value, key=tags_value.get, reverse=True)
    
    return tags[0:5]

In [41]:
df_articles['tags'] = df_articles['content'].apply(lambda x: get_tags(x))

In [42]:
# drop duplicate by a column name
 
df_articles = df_articles.drop_duplicates(['title'], keep='last')

df_articles = df_articles.drop_duplicates(['contentId'], keep='last')

df_articles

Unnamed: 0,contentId,authorPersonId,authorSessionId,contentType,title,text,lang,datetime,score_y,content,tags
0,-4110354420726924665,4340306774493623681,8940341205206233829,HTML,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en,2016-03-28 15:39:48,0.157666,"Ethereum, a Virtual Currency, Enables Transact...","[Bitcoin, Microsoft, Buterin, Ethereum, Fortress]"
1,-7292285110016212249,4340306774493623681,8940341205206233829,HTML,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en,2016-03-28 15:42:26,0.199773,Bitcoin Future: When GBPcoin of Branson Wins O...,"[USDcoin, BTC, Trump, TeachBot, Bitcoin]"
2,-6151852268067518688,3891637997717104548,-1457532940883382585,HTML,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en,2016-03-28 15:47:54,2.967408,Google Data Center 360° TourWe're excited to s...,"[YouTube, Cardboard, Google Cloud Platform, Go..."
3,2448026894306402386,4340306774493623681,8940341205206233829,HTML,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en,2016-03-28 15:48:17,0.000000,"IBM Wants to ""Evolve the Internet"" With Blockc...","[IBM, Bitcoin, Wolpert, Linux, Ripple]"
4,-2826566343807132236,4340306774493623681,8940341205206233829,HTML,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,en,2016-03-28 15:48:42,0.324164,IEEE to Talk Blockchain at Cloud Computing Oxf...,"[blockchain, IEEE, Talk Blockchain, Cloud Comp..."
...,...,...,...,...,...,...,...,...,...,...,...
3005,9213260650272029784,3609194402293569455,7144190892417579456,HTML,"Conheça a Liga IoT, plataforma de inovação abe...","A Liga Ventures, aceleradora de startups espec...",pt,2017-02-24 09:30:04,1.854600,"Conheça a Liga IoT, plataforma de inovação abe...","[Intel, Liga Ventures, como agricultura, varej..."
3006,-3295913657316686039,6960073744377754728,-8193630595542572738,HTML,Amazon takes on Skype and GoToMeeting with its...,"Amazon has launched Chime, a video conferencin...",en,2017-02-24 09:37:47,0.474570,Amazon takes on Skype and GoToMeeting with its...,"[Chime, Amazon, Microsoft, Pro, Skype]"
3007,3618271604906293310,1908339160857512799,-183341653743161643,HTML,Code.org 2016 Annual Report,"February 9, 2017 - We begin each year with a l...",en,2017-02-27 14:20:24,0.153143,"Code.org 2016 Annual ReportFebruary 9, 2017 - ...","[CS, AP, U.S., 48%, NSF]"
3008,6607431762270322325,-1393866732742189886,2367029511384577082,HTML,JPMorgan Software Does in Seconds What Took La...,"At JPMorgan Chase & Co., a learning machine is...",en,2017-02-28 11:51:59,0.155035,JPMorgan Software Does in Seconds What Took La...,"[JPMorgan, Zames, COIN, Kholood Eid/Bloomberg,..."


In [43]:
articles_influence

Unnamed: 0,contentId,Popularity,Views
0,-9222795471790223670,12.536278,26
1,-9216926795620865886,7.198155,21
2,-9194572880052200111,6.889131,29
3,-9192549002213406534,19.426535,56
4,-9190737901804729417,2.353585,9
...,...,...,...
2982,9213260650272029784,1.854600,11
2983,9215261273565326920,11.876404,30
2984,9217155070834564627,5.999776,16
2985,9220445660318725468,17.292027,52


In [44]:
df_articles = pd.merge(df_articles, articles_influence, how="left")

df_articles = df_articles.fillna('0')

In [45]:
df_articles['Popularity'] = df_articles['Popularity'].astype(float)
df_articles['Views'] = df_articles['Views'].astype(int)

In [46]:
df_articles[['contentId','authorPersonId','title','text','tags','Popularity','Views']].head(5)

Unnamed: 0,contentId,authorPersonId,title,text,tags,Popularity,Views
0,-4110354420726924665,4340306774493623681,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,"[Bitcoin, Microsoft, Buterin, Ethereum, Fortress]",0.157666,1
1,-7292285110016212249,4340306774493623681,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,"[USDcoin, BTC, Trump, TeachBot, Bitcoin]",0.199773,1
2,-6151852268067518688,3891637997717104548,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,"[YouTube, Cardboard, Google Cloud Platform, Go...",2.967408,13
3,2448026894306402386,4340306774493623681,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,"[IBM, Bitcoin, Wolpert, Linux, Ripple]",0.0,0
4,-2826566343807132236,4340306774493623681,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,"[blockchain, IEEE, Talk Blockchain, Cloud Comp...",0.324164,2


## Usecase: results by searching a tag

when a user search the tag: #microsoft 

In [47]:
def get_search_result(tag_name):
    
    tag_name = tag_name.lower()
    
    tmp = df_articles[['contentId','authorPersonId','title','text','tags','Popularity','Views']].copy()
    
    tmp['tags'] = tmp['tags'].apply(lambda x: [n.lower() for n in x])
    
    tmp['result'] = tmp['tags'].apply(lambda x: tag_name in x)
    
    tmp['result'] = tmp['result'].astype(str)
    
    tmp = tmp[tmp['result'] == 'True'][['contentId','result']]
    
    tmp1 = df_articles[['contentId','authorPersonId','title','text','tags','Popularity','Views']].copy()
    
    tmp1 = pd.merge(tmp1, tmp, how ='inner')
    
    return tmp1[['contentId','authorPersonId','title','text','tags','Popularity','Views']].sort_values(by='Views',ascending=False)
    

In [48]:
get_search_result('microsoft')

Unnamed: 0,contentId,authorPersonId,title,text,tags,Popularity,Views
36,-6843047699859121724,7527226129639571966,"Ganhe 6 meses de acesso ao Pluralsight, maior ...","Ganhe 6 meses de acesso ao Pluralsight, maior ...","[Pluralsight, plataforma de treinamento, Micro...",112.896506,281
70,-8518096793350810174,1895326251577378793,"Microsoft adquire LinkedIn por US$ 26,2 bilhões",O mercado de tecnologia começou a semana em ri...,"[LinkedIn, Microsoft, Nadella, bilhõesO, dessa...",59.500157,156
58,2285214528595997209,-9009798162809551896,Docker e .Net,Oi pessoal! Rodar .Net em container? SIM!! é p...,"[Microsoft, para isso, vem, NuGet, Suporte]",57.177634,142
60,3306277069425849869,9109075639526981934,Google segue Microsoft e lança ferramenta anal...,O Google está intensificando seu compromisso c...,"[Google, Data Studio, Microsoft, grátisO Googl...",40.313487,101
130,-2097075598039554565,3609194402293569455,"The Languages, Frameworks and Tools You Should...","The Languages, Frameworks and Tools You Should...","[JavaScript, JS, TypeScript, Microsoft, Ruby]",20.320173,95
...,...,...,...,...,...,...,...
33,8870251820986769829,6971525809430309144,Facebook adds Windows 10 UWP support for makin...,Facebook is expanding the reach of apps that a...,"[React Native, Microsoft, Windows 10, UWP, F8]",0.321855,1
123,-9152398073968262186,-7496361692498935601,"Microsoft Is Now 'Open By Default', Says Xamar...",What's it like to produce an open source produ...,"[Microsoft, Xamarin, de Icaza, Mono, Windows]",0.164165,1
0,-4110354420726924665,4340306774493623681,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,"[Bitcoin, Microsoft, Buterin, Ethereum, Fortress]",0.157666,1
8,8082202054464515448,-7531858294361854119,"Com HoloLens, Microsoft torna realidade a mens...",As recentes demonstrações do HoloLens da Micro...,"[Microsoft, Star Wars""As, zumbi ou, unicórnio,...",0.000000,0


## Usecase: tags recommandation on a new article

In [49]:
df_articles[df_articles['Views'] == 0].head(3)

Unnamed: 0,contentId,authorPersonId,authorSessionId,contentType,title,text,lang,datetime,score_y,content,tags,Popularity,Views
3,2448026894306402386,4340306774493623681,8940341205206233829,HTML,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en,2016-03-28 15:48:17,0.0,"IBM Wants to ""Evolve the Internet"" With Blockc...","[IBM, Bitcoin, Wolpert, Linux, Ripple]",0.0,0
5,-2148899391355011268,4340306774493623681,8940341205206233829,HTML,Banks Need To Collaborate With Bitcoin and Fin...,It will take time until banks come around to t...,en,2016-03-28 15:49:17,0.0,Banks Need To Collaborate With Bitcoin and Fin...,"[Bitcoin, Fintech, the Monetary Authority of S...",0.0,0
33,3037840448416371691,4340306774493623681,-7292854281461484137,HTML,Bitcoin Wallets as Swiss Bank Accounts: The De...,Bitcoin was seemingly dragged into the very pu...,en,2016-03-29 13:06:48,0.0,Bitcoin Wallets as Swiss Bank Accounts: The De...,"[Bitcoin, Swiss, Bitcoin Magazine, Obama, Brea...",0.0,0


assume the article whose contentID is 2448026894306402386 is a new article just been uploaded, and the author can get tags recommendation automatically based on other popular& similar articles

In [50]:
def get_popular_recommendation(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:200]

    # Get the movie indices
    article_indices = [i[0] for i in sim_scores]

    # Get similar movies and sort them by Popularity
    result = df_articles[['contentId','authorPersonId','title','text','tags','Popularity','Views']].iloc[article_indices]

    # Return the top 10 most similar & popular movies
    
    return result[result['Views']>0].sort_values(by='Popularity',ascending = False).head(10)

In [51]:
df_articles[df_articles['contentId']==2448026894306402386]['title'].values[0]

'IBM Wants to "Evolve the Internet" With Blockchain Technology'

In [52]:
def get_tags(contentId):
    
    title_content = df_articles[df_articles['contentId']==contentId]['title'].values[0]
    
    result = get_popular_recommendation(title_content)
    
    tags_list = result['tags'].values[0]+result['tags'].values[1]+result['tags'].values[2]+result['tags'].values[3]+result['tags'].values[4]+result['tags'].values[5]+result['tags'].values[6]+result['tags'].values[7]+result['tags'].values[8]+result['tags'].values[9]
    
    x = Counter(tags_list)
    
    tags = sorted(x, key=x.get, reverse=True)
    
    return tags[0:5]


In [53]:
get_tags(3037840448416371691)

['Google', 'Apple', 'blockchain', 'bitcoin', 'GoogleBank']

<center><h1>&#10024; Article Influence prediction and Evaluation&#10024;</h1></center>


<center>How many views I will probably get? / What's the popularity score I will probably get?</center>

# NLTK

In [61]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity 
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [62]:
sentim_analyzer = SentimentAnalyzer()
vectorizer = TfidfVectorizer("english") ### check tf-idf theory, input: arr of string, output: feature arr, column: tf-idf feature

# Feature engineer steps
#### 1 insight
#### 2 summarization
#### 3 remove puntucation
#### 4 remove stop words
#### 5 remove stemmization, lemmatization

#### 6 dimension reduction
#### 7 train test split

###### others features: n-gram

### Generate insight

In [1]:
def pre_process(text):
    
    ### Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    ### Remove stops words
    text = [word for word in text.split() if word.lower() not in stopwords.words('english')]
    ### Remove lemmatization
    wnl = nltk.WordNetLemmatizer()
    text = list(map(lambda x:wnl.lemmatize(x),text))
    ### Remove stemmization
    words = ""
    for i in text:
            stemmer = SnowballStemmer("english")
            words += (stemmer.stem(i))+" "
    print('done')
    return words

In [66]:
article_texts = df_articles_influence.text.values

In [67]:
article_texts[1]

'The alarm clock wakes me at 8:00 with stream of advert-free broadcasting, charged at one satoshi per second. The current BTC exchange rate makes that snooze button a costly proposition! So I get up, make coffee and go to my computer to check the overnight performance of my bots. TradeBot earns me on Trump and Branson TradeBot, which allocates funds between the main chain and various national currency side-chains, generated a lucrative 0.24 BTC return. TradeBot has been reliably profitable ever since I set it to trade USDcoin according to political prediction market data. As expected, the latest poll numbers came in as highly supportive of Trump\'s re-election as USDcoin CEO. Trump\'s resistance to de-anonymizing public spending, by moving USDcoin off the Confidential Transactions layer, continues to erode his coin\'s credibility. In his latest speech, Trump maintains that full CT-privacy is essential to "combatting CNYcoin\'s sinister ring-signature scheming." I make a note to increas

In [70]:
article_texts = np.array(list(map(lambda x:pre_process(x),article_texts)))

done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done


done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done


In [71]:
features = vectorizer.fit_transform(article_texts)
features.toarray().shape

(3010, 68210)

### Summarization

Text summarization refers to the technique of shortening long pieces of text. The intention is to create a coherent and fluent summary having only the main points outlined in the document.

In [73]:
from gensim.summarization.summarizer import summarize

In [74]:
article_texts[1]

'alarm clock wake 800 stream advertfre broadcast charg one satoshi per second current btc exchang rate make snooz button cost proposit get make coffe go comput check overnight perform bot tradebot earn trump branson tradebot alloc fund main chain various nation currenc sidechain generat lucrat 024 btc return tradebot reliabl profit ever sinc set trade usdcoin accord polit predict market data expect latest poll number came high support trump reelect usdcoin ceo trump resist deanonym public spend move usdcoin confidenti transact layer continu erod coin credibl latest speech trump maintain full ctprivaci essenti combat cnycoin sinist ringsignatur scheme make note increas long posit gbpcoin follow ceo branson memo effect govern financ nation bank brought complianc public blockchain british corrupt index flatlin first nation econmi go light britain lead global recoveri great debt default 20 happi goatdata project check teachbot note perform inlin expect teachbot serf autonom infoag various 

In [75]:
summarize(df_articles_influence['text'].values[1])

"TradeBot earns me on Trump and Branson TradeBot, which allocates funds between the main chain and various national currency side-chains, generated a lucrative 0.24 BTC return.\nAs expected, the latest poll numbers came in as highly supportive of Trump's re-election as USDcoin CEO.\nHappy with the GoatData Project I check TeachBot and note that it's performing in-line with expectations."

In [76]:
def summary(string,**kwargs):
    try:
        summarized = summarize(string,**kwargs)
    except:
        return string
    return summarized

### data manipulation

In [77]:
article_Summary = list(map(lambda x:summary(x),df_articles_influence['text'].values))
new_areticle_texts = list(map(lambda x:pre_process(x),article_Summary))

done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done


done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done
done


In [78]:
newFeatures = vectorizer.fit_transform(article_Summary)
newFeatures.toarray().shape

(3010, 38524)

### dimension reduction

In [79]:
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
# pca = PCA(n_components=500, svd_solver='full')
pca = TruncatedSVD(500)

In [80]:
type(newFeatures)

scipy.sparse.csr.csr_matrix

In [81]:
reducedFeatures = pca.fit_transform(newFeatures)

In [82]:
type(reducedFeatures)

numpy.ndarray

In [83]:
reducedFeatures.shape

(3010, 500)

In [84]:
newFeatures = reducedFeatures

# Modeling



In [85]:
from  sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.linear_model import LinearRegression

In [98]:
X = newFeatures
y = df_articles_influence['Popularity']

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=46)


In [100]:
from sklearn.ensemble import RandomForestRegressor

randomForest = RandomForestRegressor(random_state=42)
randomForest.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [101]:
randomForestPredict = randomForest.predict(X_test)
randomForest_mse = mean_squared_error(y_test, randomForestPredict)
randomForestMSE = np.sqrt(randomForest_mse)
randomForestMSE

11.642932325766349

><center><h1>&#10024; Find users you may like&#10024;</h1></center>

When a user follows an authur(article), the system will recommend several users that he/she may like

In [108]:
user_list = df_users.loc[df_users['contentId'] == 310515487419366995]



In [109]:
user_list = user_list.loc[user_list['personId'] != 344280948527967603]

In [114]:
user_list = user_list.sort_values('score', ascending=False)

user_list.head(5)

Unnamed: 0,contentId,personId,sessionId,score,authorPersonId,title
32213,310515487419366995,-3500661007957156229,1335241234835338187,0.912934,-5.527146e+18,71 erros de português que precisam sumir dos s...
7058,310515487419366995,3413008167249007087,1073600550561679603,0.912934,-5.527146e+18,71 erros de português que precisam sumir dos s...
7002,310515487419366995,-1032019229384696495,3621737643587579081,0.912934,-5.527146e+18,71 erros de português que precisam sumir dos s...
6978,310515487419366995,-1130272294246983140,2631864456530402479,0.912934,-5.527146e+18,71 erros de português que precisam sumir dos s...
9327,310515487419366995,1899177452305284666,8862752102171960091,0.912934,-5.527146e+18,71 erros de português que precisam sumir dos s...


In [115]:
result = user_list['personId'].unique()

In [116]:
result[0:9]

array([-3500661007957156229,  3413008167249007087, -1032019229384696495,
       -1130272294246983140,  1899177452305284666,  -709287718034731589,
        3609194402293569455, -8763398617720485024, -1578287561410088674])

In [119]:
# n = personId ; m = contentId

# recommend top 10 users you may like

def Recommend_users_youmaylike(n, m):
    

    user_list = df_users.loc[df_users['contentId'] == m]
    
    user_list = user_list.loc[user_list['personId'] != n]
    
    user_list = user_list.sort_values('score', ascending=False)
    
    result = user_list['personId'].unique()
    
    recommendation_results = result[0:9]
    
    return recommendation_results

In [120]:
Recommend_users_youmaylike(2653698047369148236, 1959495508923903948)

array([-3295609019604191449,  3007736603136734729, -2406209330529428019,
       -2979537012405607453,  8189031666388954162,  6947583688031316012,
       -3230911339419872436,  1874422396201148365, -3620817660824718175])

### Simple Evaluation

In [121]:
df_friendship = pd.read_csv("friendship.csv")

In [122]:
df_friendship.head(5)

Unnamed: 0,User A,User B,Friend (follow each other)
0,2653698047369148236,-3295609019604191449,1
1,2653698047369148236,3007736603136734729,1
2,2653698047369148236,8189031666388954162,0
3,2653698047369148236,6947583688031316012,1
4,2653698047369148236,-3230911339419872436,0


In [123]:
a = df_friendship['Friend (follow each other)']

b = np.sum(a)

score = b/10

print(score)

0.7
