In [None]:
import time
import math
import numpy as np
import collections
from numpy import linalg as la
import json
import matplotlib.pyplot as plt 
import seaborn as sns
import networkx as nx

import pandas as pd
import random as rnd
# Location of the scraped tweets file, relative to the notebook's working directory.
data_path = '../data/one100K_v2.json'

#### Load the data from our scraped tweets file (`one100K_v2.json`) into the list `tweets`

In [None]:
## LOAD OUR DATA FROM THE SCRAPED TWEETS
# Each non-empty line of the file is kept as one raw tweet string; later cells
# parse every entry of `tweets` with json.loads.
docs_path = data_path
# Explicit encoding so the result does not depend on the platform default.
with open(docs_path, encoding='utf-8') as fp:
    lines = fp.readlines()
# Blank lines (e.g. a trailing newline at the end of the file) are skipped so
# they never reach the JSON parser.
# NOTE(review): str.replace is literal, so this only rewrites the two-character
# sequence ' +'; if collapsing runs of spaces was intended, a regex is needed.
stripped_lines = (l.strip() for l in lines)
tweets = [s.replace(' +', ' ') for s in stripped_lines if s]

In [None]:
# Report how many raw tweet lines were loaded.
corpus_size = len(tweets)
print("Total numer of tweets in the corpus: {}".format(corpus_size))

#### Create a dictionary for each tweet
For each tweet, create a dictionary containing its most relevant information (username, original text, clean tokens, number of likes, number of retweets, list of URLs, ...).
    
    Argument:  tweet -- a JSON tweet content    
    Returns:   dictRelevantInfo -- a dictionary with the processed tweet

In [None]:
def _extractStatusContent(status):
    """Collect the text, hashtags, engagement counts and URLs of one status object."""
    entities = status['entities']
    return {
        'text': status['text'],
        'hashtags': [h['text'] for h in entities['hashtags']],
        'likes': status['favorite_count'],
        'rt_count': status['retweet_count'],
        'urlsList': [u['url'] for u in entities['urls']],
    }


def getRelevantInfo(tweet):
    """Build a flat dictionary with the relevant fields of one tweet.

    If the tweet carries a usable ``retweeted_status`` object, the text,
    hashtags, URLs and like/retweet counts are taken from that retweeted
    status; otherwise they are taken from the tweet itself.

    Argument:  tweet -- a JSON string with the tweet content
    Returns:   dictRelevantInfo -- a dictionary with the keys 'tweetID',
               'text', 'username', 'date', 'hashtags', 'likes', 'rt_count',
               'urlsList', 'isRetweeted', 'idRt' and 'usernameRT'
               ('idRt' and 'usernameRT' are None when the tweet is not a retweet)
    Raises:    KeyError / TypeError when the tweet itself lacks a required field.
    """
    data = json.loads(tweet)
    date = data['created_at']
    idTweet = data['id_str']
    username = data['user']['screen_name']

    content = None
    id_retweet = None
    usernamert = None

    ## TRY TO OBTAIN INFORMATION ABOUT THE RETWEETED TWEET IF 'tweet' IS A RT.
    retweeted = data.get('retweeted_status')
    if isinstance(retweeted, dict):
        try:
            content = _extractStatusContent(retweeted)
            rt_user = retweeted['user']['screen_name']
            rt_id = retweeted['id_str']
        except (KeyError, TypeError):
            # Incomplete retweet payload: discard everything read from it so
            # nothing half-collected leaks into the fallback result below.
            content = None
        else:
            usernamert, id_retweet = rt_user, rt_id

    ## IF THE CURRENT TWEET IS NOT A RT WE OBTAIN INFORMATION ABOUT THE ORIGINAL.
    isRt = content is not None
    if not isRt:
        content = _extractStatusContent(data)

    dictRelevantInfo = {
        'tweetID': idTweet,
        'text': content['text'],
        'username': username,
        'date': date,
        'hashtags': content['hashtags'],
        'likes': content['likes'],
        'rt_count': content['rt_count'],
        'urlsList': content['urlsList'],
        'isRetweeted': isRt,
        'idRt': id_retweet,
        'usernameRT': usernamert,
    }
    return dictRelevantInfo

In [None]:
# Index every processed tweet by its own tweet ID; a later tweet with the same
# ID replaces the earlier entry.
cleanTweets = {}
for rawTweet in tweets:
    tweetInfo = getRelevantInfo(rawTweet)
    cleanTweets[tweetInfo['tweetID']] = tweetInfo

In [None]:
# Number of distinct tweet IDs kept after processing.
print(len(cleanTweets))

#### Generate a graph from the interactions of users that Retweets.
Tweets that are original (not retweets) are ignored, because they carry no interaction with other tweets; their authors still appear in the graph later if other users retweet them.
    
    Argument:  cleanTweets -- dictionary of dictionaries with all relevant info for each tweet
    Returns:   g -- a directed graph with edges User 1 --> User 2 (where User 1 retweeted User 2); freqRT -- a dictionary with the weight of each "user1->user2" edge.

In [None]:
def getRtGraph(cleanTweets):
    """Build the directed retweet graph of the corpus.

    Only entries flagged as retweets whose retweeted tweet ID is itself a key
    of ``cleanTweets`` produce an edge. The edge goes from the retweeting user
    to the retweeted user. Its weight starts at 0.1 and grows by 0.5 every
    time the same user pair shows up again.

    Argument:  cleanTweets -- dictionary of per-tweet dictionaries, keyed by tweet ID
    Returns:   (g, freqRT) -- the nx.DiGraph and a dictionary mapping
               "retweeter->retweeted" to the current weight of that edge
    """
    g = nx.DiGraph()
    freqRT = {}
    for info in cleanTweets.values():
        # Skip anything that is not a retweet.
        if not (info['isRetweeted'] == True):
            continue
        # Skip retweets whose source tweet is not part of the corpus.
        if info["idRt"] not in cleanTweets:
            continue

        retweeter = info["username"]
        retweeted = info["usernameRT"]
        key = retweeter + "->" + retweeted
        freqRT[key] = freqRT[key] + 0.5 if key in freqRT else 0.1

        # add_edge() also creates the nodes when missing; adding the same pair
        # again refreshes the 'weight' attribute with the latest value.
        g.add_edge(retweeter, retweeted, weight=freqRT[key])
    return g, freqRT

In [None]:
graph, freqRT = getRtGraph(cleanTweets)

## WEIGHT EDGES
# Two parallel lists in edge order: the (source, target) pairs and the
# 'weight' attribute of each edge, used later as the drawing width.
edge_records = list(graph.edges(data=True))
weighted_edges = [(src, dst) for src, dst, _ in edge_records]
width = [attrs['weight'] for _, _, attrs in edge_records]

In [None]:
# Draw the retweet graph: small red nodes, edges as wide as their weight.
LAYOUT_SEED = 42  # fixed so the spring layout is identical on every run

fig, ax = plt.subplots(figsize=(20, 10))
pos = nx.spring_layout(graph, iterations=5, seed=LAYOUT_SEED)
nx.draw_networkx_nodes(graph, pos, node_size=5, node_color='red', ax=ax)
nx.draw_networkx_edges(graph, pos, edgelist=weighted_edges, width=width, ax=ax)
ax.set_title("Retweet graph (edge width = edge weight)")
plt.show()

In [None]:
# Dump the weight stored for every "retweeter->retweeted" pair.
print(freqRT)

In [None]:
# Node count first, then one (source, target) pair per line.
print(graph.number_of_nodes())
for edge_pair in graph.edges:
    print(edge_pair)

In [None]:
# Materialise the graph's edges as a list so it can be shuffled and sliced below.
edges_shuffle=list(nx.to_edgelist(graph))

In [None]:

# Shuffle the edge list and split it 80/20 into train and test edges.
SPLIT_SEED = 42        # fixed seed so the split is the same on every fresh run
TRAIN_FRACTION = 0.8

# A dedicated Random instance keeps the global `random` state untouched.
# The shuffle is in place: re-run the cell that builds `edges_shuffle` before
# re-running this one, otherwise an already shuffled list is shuffled again.
rnd.Random(SPLIT_SEED).shuffle(edges_shuffle)

split_at = int(len(edges_shuffle) * TRAIN_FRACTION)
train = edges_shuffle[:split_at]
test = edges_shuffle[split_at:]

In [None]:
# Sizes of the full edge list, the test part and the train part, in that order.
for edge_subset in (edges_shuffle, test, train):
    print(len(edge_subset))

In [None]:
#### PAGE RANK
# Score every node of the retweet graph and list the nodes from highest to
# lowest score.
pagerank = nx.pagerank(graph)

pagerank_sorted = dict(sorted(pagerank.items(), key=lambda item: item[1], reverse=True))

# `cont` counts the nodes that have no entry in cleanTweets.
# NOTE(review): the graph nodes are usernames while cleanTweets is keyed by
# tweet ID, so most lookups are expected to miss -- confirm what should be
# printed for each node.
cont = 0
for x in pagerank_sorted:
    # Explicit membership test instead of a bare `except:` so that unrelated
    # errors are no longer silently counted as misses.
    if x in cleanTweets:
        print(cleanTweets[x])
    else:
        cont += 1
    print(x)
    print(pagerank_sorted[x])
    print()

print(cont)

In [None]:

# List every node that is not a neighbour of the chosen node.
# NOTE(review): the graph is built with usernames as nodes, while this looks
# like a numeric ID -- confirm the intended node. The guard keeps the cell from
# raising when the node is absent.
target_node = '1334878351970938883'
if target_node in graph:
    for x in nx.non_neighbors(graph, target_node):
        print(x)
else:
    print("Node {} is not in the graph".format(target_node))


In [None]:
# Total number of nodes in the retweet graph.
print(nx.number_of_nodes(graph))

In [None]:

# Undirected copy of the retweet graph for the Adamic-Adar computation.
g22 = graph.to_undirected(reciprocal=False, as_view=False)

# Keep only the nodes that take part in at least one edge.
nodes_bons = [node for node in g22.nodes() if g22.degree(node) >= 1]
print(len(nodes_bons))

# Every ordered pair of distinct kept nodes is a candidate, so each unordered
# pair appears twice: (a, b) and (b, a).
ebunch = [
    (first, second)
    for first in nodes_bons
    for second in nodes_bons
    if first != second
]

prediction = nx.adamic_adar_index(g22, ebunch)

In [None]:
# Show the candidate pairs whose Adamic-Adar score is non-zero.
# NOTE(review): the commented-out line below calls .items() on `prediction`,
# which is the value returned by nx.adamic_adar_index, not a dict -- confirm
# before reviving it.
#pr2=dict(sorted(prediction.items(), key=lambda item: item[2] , reverse=True))
# NOTE(review): `test2` is not defined in any visible cell; the split above
# creates `train` and `test` -- confirm which name was intended.
print(test2[0])
# Prints the object itself, not its contents.
print(prediction)
# Each element is indexed as v[2] for its score; presumably (u, v, score)
# triples -- verify against the networkx documentation.
for v in prediction:
    if v[2]!=0:
        print(v)
    

In [None]:
# Alternating Least Squares recommendation experiment.
# NOTE(review): this cell uses `sparse`, `implicit` and `sparse_user_item`,
# none of which are imported or defined in the visible notebook, and it
# indexes `data` with 'event' / 'itemid' / 'visitorid'. It looks like it
# belongs to a different dataset -- confirm whether it should stay here.
# Only the fused statements have been split so that the cell parses.
sparse_item_user = sparse.csr_matrix((data['event'].astype(float), (data['itemid'], data['visitorid'])))

model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)

# Scale the item-user matrix by a constant factor before fitting.
alpha_val = 40
data_conf = (sparse_item_user * alpha_val).astype('double')
model.fit(data_conf)

user_id = 14
recommended = model.recommend(user_id, sparse_user_item)
print(recommended)