In [None]:
import os
import pickle
import re
from collections import deque
from glob import glob

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [None]:
# Fetch the NLTK resources this notebook relies on: the punkt tokenizer models
# (word_tokenize), the stopword lists and WordNet.
# The former 'corpus' download was dropped: it is not a package id in the NLTK
# index and only produced "Package 'corpus' not found in index".
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Error loading corpus: Package 'corpus' not found in index
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Sample tweets about an IPL match; this is the raw input corpus for the pipeline below.
dataset = [
  'Very excited for todays IPL contest RPS vs KKR, @msdhoni vs @GautamGambhir fight! #IPL',
  '#poll who score 50 + score today #smithy #dhoni #stokes #Rahane #KKRvRPS #rpsvskkr #cricketlovers #ipl #IPL2017',
  'RPS should be happy team today, because KKR have decided to rest NCN. He has been in prime form. #KKRvRPS #IPL @RPSupergiants @KKRiders',
  'KKR seek to extend unbeaten run against Pune https://t.co/NdEuZIdxL5 via @cricbuzz @RPSupergiants @KKRiders #IPL',
  '#RPSvKKR Predict What will be the outcome? #ipl #KKRvRPS #ipl #Smithy #Gambhir 21',
]

In [None]:
# Token -> occurrence count; filled in as a side effect of pre_process().
preprocessed_vocabulary = {}

In [None]:
def to_lower_case(text):
  """Return `text` with every cased character converted to lowercase."""
  return text.lower()

def remove_at_word(text):
  """Drop every whitespace-separated word that starts with '@'.

  The remaining words are re-joined with single spaces, so any run of
  whitespace in the input collapses to one space.
  """
  kept_words = []
  for word in text.split():
    if not word.startswith('@'):
      kept_words.append(word)
  return ' '.join(kept_words)

def remove_hashtag(text):
  """Strip one leading '#' from each word and drop words that still start with '#'.

  A word such as '#ipl' becomes 'ipl'. A lone '#', or a word that begins
  with '#' even after one '#' was stripped (e.g. '##x'), is removed entirely.
  Words are re-joined with single spaces.
  """
  kept_words = []
  for word in text.split():
    if word.startswith('#') and len(word) > 1:
      word = word[1:]
    if not word.startswith('#'):
      kept_words.append(word)
  return ' '.join(kept_words)

def remove_URL(text):
  """Delete URL-like substrings from `text`.

  Two kinds of substring are removed (replaced by the empty string):
  - anything starting with 'http' up to the next whitespace;
  - anything starting with the literal 'bit.ly' followed by at least one
    non-whitespace character.

  Surrounding whitespace is left untouched, so removing a URL in the middle
  of a sentence leaves two adjacent spaces.
  """
  text = re.sub(r"http\S+", "", text)
  # The dot is escaped so that only a literal "bit.ly" matches; an unescaped
  # "." would also match unrelated text such as "bitxly...".
  text = re.sub(r"bit\.ly\S+", "", text)
  return text

def remove_stopwords(text):
  """Remove English stopwords from `text`.

  `text` is split on whitespace and every word that appears in NLTK's
  English stopword list is dropped; the rest are re-joined with single
  spaces. The comparison is exact, so words are only matched if they have
  the same casing as the entries in the stopword list.
  """
  # A set gives constant-time membership tests instead of scanning the
  # whole stopword list once per word.
  stopword = set(stopwords.words('english'))
  new_list = [x for x in text.split() if x not in stopword]
  return ' '.join(new_list)

def remove_punctuations(text):
  """Delete punctuation characters from `text`.

  Every character listed in `punctuations` is removed. The result is then
  split on whitespace and re-joined with single spaces, so words that
  consisted only of punctuation disappear and runs of whitespace collapse.
  """
  # Raw string: the set contains a literal backslash, and writing it as "\,"
  # in a normal string is an invalid escape sequence.
  punctuations = r'''!()-[|]`{};:'"\,<>./?@#$=+%^&*_~'''
  stripped = text.translate(str.maketrans('', '', punctuations))
  return ' '.join(stripped.split())

def tokenization(text):
  """Split `text` into tokens by delegating to NLTK's `word_tokenize`."""
  tokens = word_tokenize(text)
  return tokens

def pre_process(text):
  """Clean one tweet, tokenize it and record its tokens in the global vocabulary.

  The cleaning steps run in a fixed order: lowercase, drop @-words, strip
  hashtags, drop URLs, drop stopwords, drop punctuation. The cleaned string
  is then tokenized.

  Side effect: each returned token increments its count in the module-level
  `preprocessed_vocabulary` dict.

  Returns the list of tokens for this tweet.
  """
  global preprocessed_vocabulary
  cleaning_steps = (
    to_lower_case,
    remove_at_word,
    remove_hashtag,
    remove_URL,
    remove_stopwords,
    remove_punctuations,
  )
  for step in cleaning_steps:
    text = step(text)
  tokens = tokenization(text)
  for token in tokens:
    preprocessed_vocabulary[token] = preprocessed_vocabulary.get(token, 0) + 1
  return tokens

In [None]:
# pre_process() adds to the shared preprocessed_vocabulary counter on every call,
# so empty it first: re-running this cell must not double-count tokens.
preprocessed_vocabulary.clear()
# One token list per tweet, in the same order as `dataset`.
preprocessed_data = [pre_process(text) for text in dataset]

In [None]:
# Show the raw token -> count mapping collected while pre-processing the tweets.
print(preprocessed_vocabulary)

{'excited': 1, 'todays': 1, 'ipl': 7, 'contest': 1, 'rps': 2, 'vs': 2, 'kkr': 3, 'fight': 1, 'poll': 1, 'score': 2, '50': 1, 'today': 2, 'smithy': 2, 'dhoni': 1, 'stokes': 1, 'rahane': 1, 'kkrvrps': 3, 'rpsvskkr': 1, 'cricketlovers': 1, 'ipl2017': 1, 'happy': 1, 'team': 1, 'decided': 1, 'rest': 1, 'ncn': 1, 'prime': 1, 'form': 1, 'seek': 1, 'extend': 1, 'unbeaten': 1, 'run': 1, 'pune': 1, 'via': 1, 'rpsvkkr': 1, 'predict': 1, 'outcome': 1, 'gambhir': 1, '21': 1}


In [None]:
# Mean number of occurrences per distinct token
# (presumably "Average Occurrence Frequency" - confirm the acronym).
total_occurrences = sum(preprocessed_vocabulary.values())
AOF_coefficient = total_occurrences/len(preprocessed_vocabulary)

# Keep only tokens that occur strictly more often than that mean and that are
# non-empty once surrounding whitespace is stripped; keys are the stripped tokens.
vocabulary = {}
for token, frequency in preprocessed_vocabulary.items():
  if frequency > AOF_coefficient and len(token.strip()):
    vocabulary[token.strip()] = frequency

In [None]:
# Show the filtered vocabulary (token -> count) that survived the frequency cut.
print(vocabulary)

{'ipl': 7, 'rps': 2, 'vs': 2, 'kkr': 3, 'score': 2, 'today': 2, 'smithy': 2, 'kkrvrps': 3}


In [None]:
# Per tweet, keep only the tokens that belong to the filtered vocabulary.
# Order and repetitions inside each tweet are preserved.
final_tokens_per_tweet = [
  [token for token in tweet_tokens if token in vocabulary]
  for tweet_tokens in preprocessed_data
]

print(preprocessed_data)
print(final_tokens_per_tweet)

[['excited', 'todays', 'ipl', 'contest', 'rps', 'vs', 'kkr', 'vs', 'fight', 'ipl'], ['poll', 'score', '50', 'score', 'today', 'smithy', 'dhoni', 'stokes', 'rahane', 'kkrvrps', 'rpsvskkr', 'cricketlovers', 'ipl', 'ipl2017'], ['rps', 'happy', 'team', 'today', 'kkr', 'decided', 'rest', 'ncn', 'prime', 'form', 'kkrvrps', 'ipl'], ['kkr', 'seek', 'extend', 'unbeaten', 'run', 'pune', 'via', 'ipl'], ['rpsvkkr', 'predict', 'outcome', 'ipl', 'kkrvrps', 'ipl', 'smithy', 'gambhir', '21']]
[['ipl', 'rps', 'vs', 'kkr', 'vs', 'ipl'], ['score', 'score', 'today', 'smithy', 'kkrvrps', 'ipl'], ['rps', 'today', 'kkr', 'kkrvrps', 'ipl'], ['kkr', 'ipl'], ['ipl', 'kkrvrps', 'ipl', 'smithy']]


In [None]:
# Give every vocabulary token a consecutive integer id (in dict order) and
# build the reverse lookup; the ids index rows/columns of the graph matrices.
vocabulary_size = len(vocabulary)
word2id = {token: token_id for token_id, token in enumerate(vocabulary.keys())}
id2word = {token_id: token for token, token_id in word2id.items()}
# Number of ids handed out; kept so the name `count` holds the same value as before.
count = vocabulary_size

print(word2id)
print(id2word)

{'ipl': 0, 'rps': 1, 'vs': 2, 'kkr': 3, 'score': 4, 'today': 5, 'smithy': 6, 'kkrvrps': 7}
{0: 'ipl', 1: 'rps', 2: 'vs', 3: 'kkr', 4: 'score', 5: 'today', 6: 'smithy', 7: 'kkrvrps'}


In [None]:
# Build the directed word graph and the per-node features derived from it.
#
# directed_graph_adjacency_matrix[x][y]: how many times token x is immediately
#   followed by a different token y in a tweet.
# edge_weight_matrix[x][y]: count(x->y) / (freq(x) + freq(y) - count(x->y)).
# first_frequency / last_frequency: how often a token opens / closes a tweet,
#   divided by the token's vocabulary count.
# degree: per token, number of adjacent-pair occurrences it takes part in.
# strength: per token, sum of the weights of the edges leaving it.
# selective_centraility: strength / degree (0 when degree is 0). The spelling
#   is kept because later cells refer to this exact name.
directed_graph_adjacency_matrix = np.zeros((vocabulary_size, vocabulary_size))
edge_weight_matrix = np.zeros((vocabulary_size, vocabulary_size))
first_frequency = dict()
last_frequency = dict()
term_frequency = vocabulary  # alias: same dict object as `vocabulary`
strength = dict()
degree = dict()
selective_centraility = dict()


# Pass 1: first/last token counts and directed adjacency counts.
for tweet in tqdm(final_tokens_per_tweet):
  # A tweet may have lost all of its tokens to the vocabulary filter;
  # it has no first/last token and no edges, so skip it.
  if not tweet:
    continue

  first_frequency[tweet[0]] = first_frequency.get(tweet[0], 0) + 1
  last_frequency[tweet[-1]] = last_frequency.get(tweet[-1], 0) + 1

  for current, following in zip(tweet, tweet[1:]):
    # Immediate repetitions of the same token do not create self-loops.
    if current == following:
      continue
    directed_graph_adjacency_matrix[word2id[current]][word2id[following]] += 1

# Pass 2: needs the completed adjacency counts from pass 1.
for tweet in tqdm(final_tokens_per_tweet):
  for current, following in zip(tweet, tweet[1:]):
    if current == following:
      continue
    x = word2id[current]
    y = word2id[following]

    # NOTE(review): degree and strength are updated once per pair occurrence,
    # so an edge seen k times contributes k times - confirm this is intended.
    degree[current] = degree.get(current, 0) + 1
    degree[following] = degree.get(following, 0) + 1

    co_occurrences = directed_graph_adjacency_matrix[x][y]
    edge_weight_matrix[x][y] = co_occurrences/(vocabulary[current] + vocabulary[following] - co_occurrences)

    strength[current] = strength.get(current, 0) + edge_weight_matrix[x][y]


# Give every vocabulary token an entry (0 when it never occurred in that role)
# and turn the first/last counts into ratios of the token's vocabulary count.
first_frequency = {token:(first_frequency[token]/vocabulary[token] if token in first_frequency else 0) for token in vocabulary.keys()}
last_frequency = {token:(last_frequency[token]/vocabulary[token] if token in last_frequency else 0) for token in vocabulary.keys()}
degree = {token:(degree[token] if token in degree else 0) for token in vocabulary.keys()}
strength = {token:(strength[token] if token in strength else 0) for token in vocabulary.keys()}
selective_centraility = {token:(strength[token]/degree[token] if degree[token]!=0 else 0) for token in vocabulary.keys()}

100%|██████████| 5/5 [00:00<00:00, 29873.96it/s]
100%|██████████| 5/5 [00:00<00:00, 18608.27it/s]


In [None]:
# Show per-token degree next to the per-token vocabulary count for comparison.
print(degree)
print(vocabulary)

{'ipl': 8, 'rps': 3, 'vs': 4, 'kkr': 5, 'score': 1, 'today': 4, 'smithy': 3, 'kkrvrps': 6}
{'ipl': 7, 'rps': 2, 'vs': 2, 'kkr': 3, 'score': 2, 'today': 2, 'smithy': 2, 'kkrvrps': 3}


In [None]:
# The central node is the token with the highest degree; ties are broken by the
# highest term frequency, and remaining ties by vocabulary order (first wins).
maxdegree = max(degree.values())
max_degree_nodes_with_freq = {
  node: term_frequency[node]
  for node, node_degree in degree.items()
  if node_degree == maxdegree
}
maxfreq = max(max_degree_nodes_with_freq.values())
central_node_name = next(
  node for node, frequency in max_degree_nodes_with_freq.items() if frequency == maxfreq
)
print("central node: ", central_node_name)

central node:  ipl


In [None]:
# Breadth-first search over the directed graph, starting at the central node.
# distance_from_central_node maps each reachable token to its hop count.
distance_from_central_node = dict()
central_node_id = word2id[central_node_name]
# deque gives O(1) pops from the front; each entry is (node id, distance).
q = deque([(central_node_id, 0)])

# Set source as visited
distance_from_central_node[central_node_name] = 0

while q:
    node_id, depth = q.popleft()
    # Print current node
    print(id2word[node_id], depth)

    # For every adjacent vertex to the current vertex
    for i in range(len(directed_graph_adjacency_matrix[node_id])):
        # Any positive count is an edge. The matrix stores occurrence counts,
        # so testing "== 1" would ignore edges that were seen more than once.
        if (directed_graph_adjacency_matrix[node_id][i] > 0 and (id2word[i] not in distance_from_central_node)):
            # Push the adjacent node in the queue
            q.append((i, depth+1))
            distance_from_central_node[id2word[i]] = depth+1

print(distance_from_central_node)
# 1/distance for reachable tokens, 0 for unreachable ones; the central node
# (distance 0) is excluded from the division and set to 1.0 explicitly.
inverse_distance_from_central_node = {token:(1/distance_from_central_node[token] if token in distance_from_central_node and token != central_node_name else 0) for token in vocabulary.keys()}
inverse_distance_from_central_node[central_node_name] = 1.0
print(inverse_distance_from_central_node)

ipl 0
rps 1
smithy 1
kkrvrps 1
vs 2
today 2
kkr 3
{'ipl': 0, 'rps': 1, 'smithy': 1, 'kkrvrps': 1, 'vs': 2, 'today': 2, 'kkr': 3}
{'ipl': 1.0, 'rps': 1.0, 'vs': 0.5, 'kkr': 0.3333333333333333, 'score': 0, 'today': 0.5, 'smithy': 1.0, 'kkrvrps': 1.0}


In [None]:
# For each node: the mean `strength` of its distinct neighbours, where a
# neighbour is any other node joined to it by an edge in either direction.
# Nodes without neighbours get 0.
neighbour_importance = dict()
node_count = len(directed_graph_adjacency_matrix)

for i in range(node_count):
  # nodes that i points to
  successors = [j for j in range(node_count) if j != i and directed_graph_adjacency_matrix[i][j] > 0]
  # nodes that point to i
  predecessors = [j for j in range(node_count) if j != i and directed_graph_adjacency_matrix[j][i] > 0]

  neighbours = set()
  neighbours.update(successors)
  neighbours.update(predecessors)

  if neighbours:
    neighbour_importance[id2word[i]] = sum([strength[id2word[j]] for j in neighbours])/len(neighbours)
  else:
    neighbour_importance[id2word[i]] = 0

print(neighbour_importance)

{'ipl': 0.6276984126984126, 'rps': 0.4398148148148148, 'vs': 0.5296296296296296, 'kkr': 0.6512896825396826, 'score': 0.5833333333333333, 'today': 0.4527777777777778, 'smithy': 0.7433862433862434, 'kkrvrps': 0.3907407407407408}


In [None]:
# Raw node weight: plain sum of the six per-node features computed above.
unnormalized_node_weight = {}
for node in vocabulary.keys():
  unnormalized_node_weight[node] = (first_frequency[node] + last_frequency[node] + term_frequency[node] + selective_centraility[node] + inverse_distance_from_central_node[node] + neighbour_importance[node])

max_node_weight = max(unnormalized_node_weight.values())
min_node_weight = min(unnormalized_node_weight.values())
print("max node weight: ", max_node_weight, "min node weight: ", min_node_weight)

# Min-max scale to [0, 1]. When every weight is equal the range is zero, and
# the raw weights are carried over unchanged instead of dividing by zero.
if max_node_weight != min_node_weight:
  normalized_node_weight = {
    node: (weight - min_node_weight)/(max_node_weight - min_node_weight)
    for node, weight in unnormalized_node_weight.items()
  }
else:
  normalized_node_weight = dict(unnormalized_node_weight)

print("Unnormalized score: ", unnormalized_node_weight)
print("Normalized score: ", normalized_node_weight)

max node weight:  9.52998015873016 min node weight:  3.0986111111111114
Unnormalized score:  {'ipl': 9.52998015873016, 'rps': 4.162037037037037, 'vs': 3.1233796296296297, 'kkr': 4.430178571428572, 'score': 3.416666666666667, 'today': 3.0986111111111114, 'smithy': 4.326719576719577, 'kkrvrps': 4.605026455026455}
Normalized score:  {'ipl': 1.0, 'rps': 0.16534985289323675, 'vs': 0.003851204671218141, 'kkr': 0.20704261417099346, 'score': 0.04945378708648396, 'today': 0.0, 'smithy': 0.19095599343084227, 'kkrvrps': 0.23422934258033795}


In [None]:
# Iterative node ranking. Each pass recomputes, for every node i:
#   R(i) = (1 - d) * W(i) + d * W(i) * sum_j( a[j][i] / out(j) * R(j) )
# where W is normalized_node_weight, a is the adjacency count matrix, out(j) is
# the total count on edges leaving j (self-loop excluded), and the sum runs over
# the nodes j != i with an edge j -> i. Iteration stops once the summed squared
# change between two passes falls below `threshold`.
damping_factor = 0.85
threshold = 0.000000001
max_iterations = 1000  # safety cap: never spin forever if the scores do not settle
random_seed = 42       # fixed seed so the starting point is the same on every run

rng = np.random.default_rng(random_seed)
relevance_of_node = {node: rng.uniform(0, 1) for node in vocabulary.keys()}


print(relevance_of_node)

# Loop-invariant: total outgoing count per node, excluding the diagonal.
node_count = len(directed_graph_adjacency_matrix)
outgoing_weight = directed_graph_adjacency_matrix.sum(axis=1) - np.diag(directed_graph_adjacency_matrix)

count = 0
converged = False
while count < max_iterations:
  count += 1
  current_relevance_of_node = dict()
  for node in vocabulary.keys():
    outer_sum = 0
    node_idx = word2id[node]
    for j in range(node_count):
      if j == node_idx:
        continue
      if directed_graph_adjacency_matrix[j][node_idx] > 0:
        # outgoing_weight[j] > 0 here because it includes the edge j -> node_idx.
        outer_sum += ((directed_graph_adjacency_matrix[j][node_idx]/outgoing_weight[j]) * relevance_of_node[id2word[j]])
    current_relevance_of_node[node] = (1-damping_factor)*normalized_node_weight[node] + damping_factor*normalized_node_weight[node]*outer_sum

  # checking convergence..
  sq_error = sum([(current_relevance_of_node[node] - relevance_of_node[node])**2 for node in vocabulary.keys()])
  relevance_of_node = current_relevance_of_node
  if sq_error < threshold:
    converged = True
    break

if not converged:
  print("warning: relevance scores did not converge within", max_iterations, "iterations")

print(relevance_of_node)
print(count)

{'ipl': 0.7103288528750432, 'rps': 0.05578237780724982, 'vs': 0.29588928573735207, 'kkr': 0.20964259931506024, 'score': 0.23801881520407897, 'today': 0.9920450562636448, 'smithy': 0.9209308688858173, 'kkrvrps': 0.18787532903134696}
{'ipl': 0.209298750775006, 'rps': 0.03460902545948005, 'vs': 0.0006682853809892396, 'kkr': 0.031115198480412807, 'score': 0.007418068062972595, 'today': 0.0, 'smithy': 0.03996859218589993, 'kkrvrps': 0.059049366830692354}
10


In [None]:
# Per node: number of distinct other nodes with an edge pointing INTO it,
# divided by (number of nodes - 1). All values stay 0 when the graph has at
# most one node.
degree_centrality = dict.fromkeys(vocabulary.keys(), 0)

node_total = len(directed_graph_adjacency_matrix)
if node_total > 1:
  for i in range(node_total):
    count = sum(
      1
      for j in range(node_total)
      if j != i and directed_graph_adjacency_matrix[j][i] > 0
    )
    degree_centrality[id2word[i]] = count / (node_total - 1)

print(degree_centrality)

{'ipl': 0.42857142857142855, 'rps': 0.14285714285714285, 'vs': 0.2857142857142857, 'kkr': 0.2857142857142857, 'score': 0.0, 'today': 0.2857142857142857, 'smithy': 0.2857142857142857, 'kkrvrps': 0.42857142857142855}


In [None]:
# One record per vocabulary token with its rank score and degree centrality.
final_keyword_rank = []
for node in vocabulary.keys():
  final_keyword_rank.append({'node': node, 'NE_rank': relevance_of_node[node], 'Degree': degree_centrality[node]})

# Records in vocabulary order ...
for entry in final_keyword_rank:
  print(entry)

print("-----------")

# ... then best first: highest NE_rank, ties broken by highest Degree.
final_keyword_rank.sort(key=lambda entry: (entry['NE_rank'], entry['Degree']), reverse=True)
for entry in final_keyword_rank:
  print(entry)

{'node': 'ipl', 'NE_rank': 0.209298750775006, 'Degree': 0.42857142857142855}
{'node': 'rps', 'NE_rank': 0.03460902545948005, 'Degree': 0.14285714285714285}
{'node': 'vs', 'NE_rank': 0.0006682853809892396, 'Degree': 0.2857142857142857}
{'node': 'kkr', 'NE_rank': 0.031115198480412807, 'Degree': 0.2857142857142857}
{'node': 'score', 'NE_rank': 0.007418068062972595, 'Degree': 0.0}
{'node': 'today', 'NE_rank': 0.0, 'Degree': 0.2857142857142857}
{'node': 'smithy', 'NE_rank': 0.03996859218589993, 'Degree': 0.2857142857142857}
{'node': 'kkrvrps', 'NE_rank': 0.059049366830692354, 'Degree': 0.42857142857142855}
-----------
{'node': 'ipl', 'NE_rank': 0.209298750775006, 'Degree': 0.42857142857142855}
{'node': 'kkrvrps', 'NE_rank': 0.059049366830692354, 'Degree': 0.42857142857142855}
{'node': 'smithy', 'NE_rank': 0.03996859218589993, 'Degree': 0.2857142857142857}
{'node': 'rps', 'NE_rank': 0.03460902545948005, 'Degree': 0.14285714285714285}
{'node': 'kkr', 'NE_rank': 0.031115198480412807, 'Degree':

In [None]:
# Draws one uniform sample from [-1, 0) and displays it.
# NOTE(review): looks like a leftover scratch cell - the value is not assigned
# or used by any cell visible here; confirm and delete if so.
np.random.uniform(-1,0,1)[0]

-0.8363891837073725

In [None]:
# Dump the edge weight matrix: one output line per source token, with
# "source target weight" triples separated by " | ".
for source_id, weights in enumerate(edge_weight_matrix):
  for target_id, weight in enumerate(weights):
    print(id2word[source_id], id2word[target_id], weight, end= " | ")
  print()