<a href="https://colab.research.google.com/github/vasishtahd/Twitter-Bot-Detection-With-Graph-Analytics/blob/master/final_node2vec_shingle_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **CSE 573: Semantic Web Mining**

![ASU logo](https://fys.asu.edu/sites/default/files/styles/panopoly_image_original/public/asu_logo_1.png?itok=pBT1SdKj)


Spring 2020

Group 4 - Project 6

Detection Of Fake Twitter Bots Through Unsupervised Learning

Members:
* Aditya Narayanan (1215232278)
* Vasishta Harekal (1215278298)
* Sumukh Ashwin Kamath (1217723013)
* Renu Kadolkar (1215200727)
* Hang Zhao (1214868408)
* Cheng Chen (1216700771)
* Shankar Krishnamoorthy (1217214947)

Code:

# Install all the required Modules.

---


Import Required Libraries.

In [24]:
!pip install -q node2vec
!pip install -q snapy
!pip install -q mmh3
!pip install -q tqdm
from tqdm import tqdm
import json

from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from google.colab import drive
from google.colab import files
from node2vec import Node2Vec
from sklearn.cluster import KMeans
from collections import Counter
import collections
import seaborn as sn
import sys
import numpy
import pickle
import os.path
from os import path
from snapy import MinHash, LSH
import time
numpy.set_printoptions(threshold=sys.maxsize)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Check All the Global Constants

In [25]:
# Global constants and shared state used by the functions defined below.

# Prefix for every per-day artifact file name (pickles, correlation CSVs, ...).
PREFIX = 'day'
# Prefix placed before the cluster number in per-cluster file names.
CLUS_PREFX = 'clus'
# Window length used by start().
# NOTE(review): 3600 is the number of seconds in an HOUR, not a day, so start()
# actually iterates over one-hour windows -- confirm this is intended.
SECONDS_IN_DAY = 3600
# Window length used by compute_all_day_statistics() (24 * 3600 seconds).
ACTUAL_SECONDS_IN_DAY=86400 
# Number of KMeans clusters requested for each day.
NO_OF_CLUSTERS = 5
# Root folder on the mounted Drive under which all outputs are written.
PROJECT_ROOT = "/content/drive/Shared drives/SWM/SWM_data/"
# Full tweet dataset; the file name suggests it is sorted by time -- TODO confirm,
# start() and compute_all_day_statistics() both rely on that ordering.
time_sorted_tweet_data = pd.read_csv("/content/drive/Shared drives/SWM/timeSortedDataset.csv", lineterminator='\n')
# Epoch time of the first row of the dataset.
start_time = time_sorted_tweet_data['postedEpochtime'].values[0]
# NOTE(review): Drive is already mounted in the imports cell, and the read_csv
# call above has already read from the mounted path, so this call looks redundant.
drive.mount('/content/drive')
# Module-level placeholders; the functions that call bot_list_prep() bind their
# own local variables with these names instead of filling these in.
botnames_list,bot_name_dict=[],{}


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# This section builds the graph from each day's tweet DataFrame and optionally plots and saves it.

In [0]:
def getGraphfromDF(dayN_tweets):
  """
  Build the user/retweet graph for one day of tweets.

  Parameters
  ----------
  dayN_tweets : Pandas Dataframe
      Tweets of a single day. Must contain the columns
      'screen_name_from' and 'retweet_tid'.

  Returns
  -------
  G : NetworkX Graph
      Graph with one edge per dataframe row, joining the value in
      'screen_name_from' to the value in 'retweet_tid'.
  """
  source_column, target_column = 'screen_name_from', 'retweet_tid'
  return nx.from_pandas_edgelist(dayN_tweets,
                                 source=source_column,
                                 target=target_column)

def plotAndSaveGraph(G, dayNumber, toPlot=False, saveToDrive=False):
  """
  Draws the graph and optionally saves and/or shows the figure.

  Parameters
  ----------
  G : NetworkX Graph
      Graph which has edge from every unique user to a unique Retweet ID.
  dayNumber : int
      Nth Day. Used to build the image file name.
  toPlot : boolean
      When True, the figure is displayed with plt.show().
  saveToDrive : boolean
      When True, the figure is written to
      PROJECT_ROOT/SWM_dayGraphs/<PREFIX><dayNumber>RetweetGraph.png.

  Returns
  -------
  None.
  """
  # The graph is always drawn onto the current matplotlib figure, even when
  # both flags are False (unchanged from the original behaviour).
  nx.draw(G,
        #with_labels=True,
        node_color="skyblue",
        node_size=3,
        width=2,
        edge_color="orange",
       )
  # Save BEFORE showing: once plt.show() returns, the figure may already be
  # closed, and a later savefig() would then write an empty image.
  if saveToDrive:
    print('Saving Graph Image to Drive...')
    folder_name = "SWM_dayGraphs/"
    file_name = PREFIX + str(dayNumber) +"RetweetGraph.png"
    plt.savefig(PROJECT_ROOT + str(folder_name) + str(file_name) )
    print('Saving Complete')
  if toPlot:
    plt.show()

# This section contains Node2Vec Implementation. 

**List of References:**

*   https://snap.stanford.edu/node2vec 
*   https://github.com/eliorc/node2vec
*   https://towardsdatascience.com/node2vec-embeddings-for-graph-data-32a866340fef   
*   https://cs.stanford.edu/~jure/pubs/node2vec-kdd16.pdf


In [0]:
def getNode2VecData(G, dayN_tweets):
  """
  Returns the Vector Representation [Node2Vec] of the Graph.

  Parameters
  ----------
  G : NetworkX Graph
      Graph which has edge from every unique user to a unique Retweet ID.
  dayN_tweets : Pandas Dataframe
      It is the Dataframe containing all tweets in a single day. Only the
      'screen_name_from' column is read here.

  Returns
  -------
  vect_repr : list
      One embedding per ROW of dayN_tweets (not per unique user), in row
      order, so that the list can be aligned with the dataframe index.
      A user that appears in several rows contributes the same vector
      several times.
  """
  node2vec = Node2Vec(G, dimensions=40, walk_length=8, num_walks=30, quiet=False, workers=5)
  model = node2vec.fit(window=10, min_count=1)

  # Embeddings are looked up by the string form of the screen name.
  # NOTE(review): this presumes node2vec keys its vocabulary by str(node).
  return [model.wv.get_vector(str(user_name))
          for user_name in dayN_tweets['screen_name_from'].values]

# Perform Clustering for Every Day

In [0]:
def performClustering(dayNtweets, vec_reprs, dayNumber, no_of_clusters, toVisualize=False):
  """
  Clusters the users of one day with KMeans on their Node2Vec vectors.

  Parameters
  ----------
  dayNtweets : Pandas Dataframe
      It is the Dataframe containing all tweets in a single day.
      It is NOT modified; all work happens on a copy.
  vec_reprs : list
      One vector per row of dayNtweets, in row order (as produced by
      getNode2VecData).
  dayNumber : int
      Nth Day. Currently unused.
  no_of_clusters : int
      Number of Clusters to Form.
  toVisualize : boolean
      Currently unused (visualisation is not implemented).

  Returns
  -------
  dayNtweetsByCLabels : Pandas DataFrameGroupBy
      The de-duplicated dataframe (one row per 'screen_name_from', first
      occurrence kept) with the extra columns 'vectors_reprs' and
      'clusterLabels', grouped by 'clusterLabels'. Use
      get_group(<label>) to obtain the rows of a single cluster.
  """
  # assign() returns a new frame, so the caller's dataframe is left untouched.
  tweets_with_vectors = dayNtweets.assign(
      vectors_reprs=pd.Series(vec_reprs, index=dayNtweets.index.values))

  # Keep one row (and therefore one vector) per user; the explicit copy makes
  # the result an independent frame before a new column is added to it.
  users = tweets_with_vectors.drop_duplicates(subset="screen_name_from",
                                              keep="first").copy()

  user_vectors = np.array(users['vectors_reprs'].tolist())
  clustering = KMeans(n_clusters=no_of_clusters, random_state=0).fit(user_vectors)
  users['clusterLabels'] = np.array(clustering.labels_)

  # Group by the column name itself (not a one-element list) so that callers
  # can keep calling get_group() with a plain integer label.
  return users.groupby('clusterLabels')

# Form Correlation Matrix for Every Cluster



In [0]:
def performCorrelation(clusterNumber, dayNumber,
                       dayNtweetsByCLabels, saveToDrive=False, toVisualize=False):
  """
  Builds the user-by-user correlation matrix for one cluster.

  Parameters
  ----------
  clusterNumber : int
      Nth Cluster. Used in log output and file names.
  dayNumber : int
      Nth Day. Used in file names.
  dayNtweetsByCLabels : Pandas Dataframe
      Rows of a single cluster. Must contain the columns
      'vectors_reprs' (one vector per row) and 'screen_name_from'.
  saveToDrive : boolean
      When True, the matrix is written to
      PROJECT_ROOT/SWM_corrData/<PREFIX><day>_<CLUS_PREFX><cluster>corrMat.csv.
  toVisualize : boolean
      When True, a heatmap of the matrix is written to
      PROJECT_ROOT/SWM_dayCorrMatVis/<PREFIX><day>_<CLUS_PREFX><cluster>CorrMatVis.png.

  Returns
  -------
  None.
  """
  # Expand the per-row vectors into one dataframe column per dimension.
  vec_df = dayNtweetsByCLabels['vectors_reprs'].apply(pd.Series)

  # Map each dataframe index label to its screen name so that the correlation
  # matrix can be labelled with account names on both axes.
  user_name_dict = dict(zip(dayNtweetsByCLabels.index.values,
                            dayNtweetsByCLabels['screen_name_from'].values))
  print(" Current Cluster ", clusterNumber)

  # Transpose so that every user becomes a column; corr() then correlates users.
  corr = vec_df.T.corr()
  corr = corr.rename(columns=user_name_dict, index=user_name_dict)

  if saveToDrive:
    print('Saving CSV to Drive...')
    folder_name = "SWM_corrData/"
    file_name = PREFIX + str(dayNumber) + "_" + CLUS_PREFX + str(clusterNumber) + "corrMat.csv"
    corr.to_csv(PROJECT_ROOT + str(folder_name) + str(file_name))
    print('Saving Complete')

  if toVisualize:
    print('Saving CorrelationMatrix Visualization to Drive...')

    fig, ax = plt.subplots()
    # seaborn is imported as `sn` at the top of the notebook.
    sn.heatmap(corr, annot=True, fmt='.4f',
               cmap=plt.get_cmap('coolwarm'), cbar=False, ax=ax)
    ax.set_yticklabels(ax.get_yticklabels(), rotation="horizontal")

    folder_name = "SWM_dayCorrMatVis/"
    # Include the cluster number so that the clusters of one day do not
    # overwrite each other's image.
    file_name = (PREFIX + str(dayNumber) + "_" + CLUS_PREFX
                 + str(clusterNumber) + "CorrMatVis.png")
    fig.savefig(PROJECT_ROOT + str(folder_name) + str(file_name),
                bbox_inches='tight', pad_inches=0.0)
    # This function is called once per cluster per day; release the figure.
    plt.close(fig)

    print('Saving Complete')

# This section consists of Pickle Saving Methods. Models can be saved and loaded from the pickle file to reduce computation time.

In [0]:
def saveToPickle(vector_reps, dayNumber):
  """
  Saves the Vector Representations to a Pickle File in GDrive.

  The file is written to
  PROJECT_ROOT/SWM_pickleVecReprs/<PREFIX><dayNumber>_vect_rps.p.

  Parameters
  ----------
  vector_reps : list
      It contains the vector representation of each twitter user
      based on the model generated by Node2Vec.
  dayNumber : int
      Nth Day.

  Returns
  -------
  None.
  """
  folder_name = "SWM_pickleVecReprs/"
  file_name = PREFIX + str(dayNumber) +"_vect_rps.p"
  # Use a context manager so the handle is flushed and closed even if
  # pickle.dump raises.
  with open(PROJECT_ROOT + str(folder_name) + str(file_name), "wb") as pickle_file:
    pickle.dump(vector_reps, pickle_file)


def isPickleExisting(dayNumber):
  """
  Tells whether the vector pickle of a given day is already on disk.

  Parameters
  ----------
  dayNumber : int
      Nth Day.

  Returns
  -------
  bool
      True when PROJECT_ROOT/SWM_pickleVecReprs/<PREFIX><dayNumber>_vect_rps.p
      exists, False otherwise.
  """
  pickle_name = PREFIX + str(dayNumber) + "_vect_rps.p"
  pickle_location = PROJECT_ROOT + "SWM_pickleVecReprs/" + pickle_name
  return path.exists(pickle_location)

def loadPickle(dayNumber):
  """
  Load the pickle file based on day number.

  Reads PROJECT_ROOT/SWM_pickleVecReprs/<PREFIX><dayNumber>_vect_rps.p.

  Parameters
  ----------
  dayNumber : int
      Nth Day.

  Returns
  -------
  The object stored in the pickle (the vector representations written by
  saveToPickle for that day).
  """
  folder_name = "SWM_pickleVecReprs/"
  file_name = PREFIX + str(dayNumber) +"_vect_rps.p"
  file_path = PROJECT_ROOT + str(folder_name) + str(file_name)
  # SECURITY: pickle.load can execute arbitrary code; only load files that
  # this notebook wrote itself.
  # The context manager closes the handle even when unpickling fails.
  with open(str(file_path), "rb") as pickle_file:
    return pickle.load(pickle_file)

# This section applies shingling (MinHash + LSH) to group near-duplicate tweets under a single retweet id.
List of References:

*  https://pypi.org/project/snapy/

*  https://medium.com/@jonathankoren/near-duplicate-detection-b6694e807f7a

*  https://minteressa.github.io/modelling/2016/06/20/near-duplicate-detection.html

*  https://pradeepprabakar.wordpress.com/2012/05/30/near-duplicate-detection-in-streaming-twitter-data/

*  http://www.softcorporation.com/products/neardup/


In [0]:
def performShingles(dayNtweet):
  """
  Finds near-duplicate tweets within one day using MinHash + LSH.

  Parameters
  ----------
  dayNtweet : Pandas Dataframe
      Tweets of a single day. The tweet text is read from the 'body'
      column and the dataframe index is used as the label of each text.

  Returns
  -------
  near_dupl_dict : dict
      Maps every index label of dayNtweet to the result of querying the
      LSH index for that label with min_jaccard=0.5 (presumably the labels
      of its near duplicates -- see the snapy documentation).
  """
  texts = dayNtweet['body'].values
  row_labels = list(dayNtweet.index)

  signatures = MinHash(texts, n_gram=9,
                       permutations=100, hash_bits=64, seed=3)
  lsh_index = LSH(signatures, row_labels, no_of_bands=50)

  # One query per row label; the label is cast to int before querying.
  return {label: lsh_index.query(int(label), min_jaccard=0.5)
          for label in row_labels}


def replaceNearDupl(dayNtweets, near_dupl_dict):
  """
  Gives every near-duplicate tweet the retweet id of its reference tweet.

  The dataframe is modified in place.

  Parameters
  ----------
  dayNtweets : Pandas Dataframe
      Tweets of a single day; must contain the column 'retweet_tid'.
  near_dupl_dict : dict
      Maps an index label of dayNtweets to the index labels of its near
      duplicates.

  Returns
  -------
  dayNtweets : Pandas Dataframe
      The same dataframe object, where each listed duplicate row now
      carries the 'retweet_tid' its key row had at the time of processing.
  """
  tid_column = 'retweet_tid'
  for source_label, duplicate_labels in near_dupl_dict.items():
    for duplicate_label in duplicate_labels:
      # Re-read the source value on every step: earlier assignments may have
      # changed it already.
      dayNtweets.at[duplicate_label, tid_column] = dayNtweets.loc[source_label, tid_column]
  return dayNtweets

def replaceNearDuplOptimized(dayNtweets, near_dupl_dict):
  """
  Variant of replaceNearDupl that skips part of the replacements.

  Each near-duplicate row receives the retweet id of its key row, except
  that a row which has already been used as the SOURCE of a replacement is
  never overwritten afterwards. The dataframe is modified in place.

  NOTE(review): the guard set records the source label, not the label that
  was just replaced. Confirm that protecting sources (rather than skipping
  already-replaced rows) is the intended behaviour.

  Parameters
  ----------
  dayNtweets : Pandas Dataframe
      Tweets of a single day; must contain the column 'retweet_tid'.
  near_dupl_dict : dict
      Maps an index label of dayNtweets to the index labels of its near
      duplicates.

  Returns
  -------
  dayNtweets : Pandas Dataframe
      The same dataframe object with the updated 'retweet_tid' values.
  """
  tid_column = 'retweet_tid'
  used_as_source = set()
  for source_label, duplicate_labels in near_dupl_dict.items():
    for duplicate_label in duplicate_labels:
      if duplicate_label in used_as_source:
        continue
      dayNtweets.at[duplicate_label, tid_column] = dayNtweets.loc[source_label, tid_column]
      # Only recorded once the key has actually overwritten another row.
      used_as_source.add(source_label)

  return dayNtweets

# This section contains the Driver Method.

In [0]:
def start(startTime, noOfDays=sys.maxsize):
  """
  Driver Method for the project.

  Walks through the global time_sorted_tweet_data window by window. For each
  window it replaces near-duplicate retweet ids, builds the graph, obtains
  the Node2Vec vectors (from the pickle cache when one exists), clusters the
  users and saves one correlation matrix per cluster.

  Parameters
  ----------
  startTime : int
      Epoch Time of the starting tweet.
  noOfDays : int
      Maximum number of windows to process.
      Default is sys.maxsize; the loop stops earlier once the dataset
      has been exhausted.

  Returns
  -------
  None.

  Notes
  -----
  NOTE(review): the window length is SECONDS_IN_DAY, which is set to 3600
  in the constants cell, i.e. one hour rather than one day -- confirm.
  """

  # Running total of rows consumed by all windows processed so far.
  prev_length = 0
  for dayNumber in range(noOfDays):
    print(" Day :",dayNumber)
    next_day = (startTime) + (SECONDS_IN_DAY)
    # Both bounds are inclusive.
    qury = str(startTime) + ' <= postedEpochtime <= ' + str(next_day)
      
    dayN_tweets = time_sorted_tweet_data.query(qury)
    # Progress/diagnostic output: sample rows, row counts, unique counts.
    print(dayN_tweets.head(5))
    print(len(dayN_tweets['retweet_tid']  ), len(dayN_tweets['screen_name_from'] ))
    print(len(dayN_tweets['retweet_tid'].unique()), len(dayN_tweets['screen_name_from'].unique()))
    # Collapse near-duplicate tweets onto a shared retweet id.
    dayN_tweets = replaceNearDuplOptimized(dayN_tweets, performShingles(dayN_tweets))
    print(len(dayN_tweets['retweet_tid'].unique()))
    # The tweet text is not needed once the near duplicates are resolved.
    dayN_tweets = dayN_tweets.drop('body', axis=1)
    G = getGraphfromDF(dayN_tweets)
    ##Plot Graph | Uncomment if required 
    #plotAndSaveGraph(G, dayNumber, False, False)

    # Node2Vec results are cached per day number in a pickle file.
    if not isPickleExisting(dayNumber):
      
      ##Vector Representation of the Graph Nodes for that Day 
      vector_reps = getNode2VecData(G, dayN_tweets)
      saveToPickle(vector_reps, dayNumber)
       
    else:
      vector_reps = loadPickle(dayNumber)
    # Row count is taken here, before the frame is handed to performClustering.
    lengthofdayNTweets = len(dayN_tweets)
    df_WithClLabel = performClustering(dayN_tweets, vector_reps, dayNumber, NO_OF_CLUSTERS, toVisualize=False)
    for cluster in range(NO_OF_CLUSTERS):
      performCorrelation(cluster, dayNumber, 
                        df_WithClLabel.get_group(cluster), 
                        saveToDrive=True, toVisualize=False)
    

    lengthOfTweets = lengthofdayNTweets + prev_length 
    prev_length = lengthOfTweets
    try:
      # The next window starts at the epoch time of the row located at the
      # cumulative row count; this presumes the dataset is sorted by
      # postedEpochtime -- TODO confirm.
      startTime = time_sorted_tweet_data['postedEpochtime'].values[lengthOfTweets]
      # NOTE(review): no_of_days is assigned but never read in this function.
      no_of_days = dayNumber
    except IndexError:
      # The cumulative row count ran past the last row: nothing left to process.
      break

# Most Correlated Accounts Evaluation




In [0]:
 def bot_list_prep():
   """
   Loads the known bot names from botnames.csv.

   Returns
   -------
   botnames_list : list
       Values of the 'BotName' column, in file order.
   bot_name_dict : dict
       Lookup table with every bot name as key and 1 as value.
   """
   bot_table = pd.read_csv("/content/drive/Shared drives/SWM/botnames.csv", lineterminator='\n')
   botnames_list = bot_table['BotName'].tolist()
   print("Total Bots:",len(botnames_list))
   # Every name maps to the constant 1; only key membership matters.
   bot_name_dict = dict.fromkeys(botnames_list, 1)
   return botnames_list,bot_name_dict
def get_cluster_correlation(dayNumber,clusterNumber):
    """
    Loads the correlation matrix of one cluster of one day from Drive.

    Reads PROJECT_ROOT/SWM_corrData/<PREFIX><day>_<CLUS_PREFX><cluster>corrMat.csv,
    the file written by performCorrelation.

    Parameters
    ----------
    dayNumber : int
        Nth Day.
    clusterNumber : int
        Nth Cluster.

    Returns
    -------
    correlation_df : Pandas Dataframe
        The matrix with the first CSV column as index and every remaining
        column converted to a numeric dtype.
    """
    print('Loading CSV from Drive...')
    folder_name = "SWM_corrData/"
    file_name = PREFIX + str(dayNumber) + "_" + CLUS_PREFX + str(clusterNumber) + "corrMat.csv"
    correlation_df = pd.read_csv(PROJECT_ROOT + str(folder_name) + str(file_name), sep = ',', index_col=0)
    print('Loaded correlation data for day:', dayNumber, ' for cluster number:', clusterNumber)
    # index_col=0 has already moved the label column into the index, so EVERY
    # remaining column holds correlation values and must be converted
    # (the first one used to be skipped).
    columns_names = list(correlation_df.columns)
    correlation_df[columns_names] = correlation_df[columns_names].apply(pd.to_numeric)
    return correlation_df

def correlation_index_filter(x):
    """
    Custom filter: tells whether the first two space-separated tokens of
    str(x) differ.

    Raises IndexError when str(x) contains no space.
    """
    tokens = str(x).split(" ")
    return not (tokens[0] == tokens[1])
def get_top_correlation_pairs(correlation_df_in, k=50, absolute=False ):
    """
    Gets the k most correlated account pairs of a correlation matrix.

    Parameters
    ----------
    correlation_df_in : Pandas Dataframe
        Square correlation matrix labelled with account names on both axes.
    k : int
        Number of pairs to return (default 50).
    absolute : boolean
        Accepted for backward compatibility; currently ignored.

    Returns
    -------
    Pandas Series
        At most k correlation values in descending order, indexed by a
        (column label, row label) MultiIndex. Pairs whose two labels are
        equal (the diagonal) are excluded. Both (a, b) and (b, a) are kept,
        so a symmetric matrix yields every pair twice.
    """
    pair_values = correlation_df_in.unstack()

    # Vectorised removal of the self-pairs instead of a Python-level filter
    # over every matrix entry followed by a reindex.
    first_label = pair_values.index.get_level_values(0)
    second_label = pair_values.index.get_level_values(1)
    off_diagonal = pair_values[first_label != second_label]

    ranked = off_diagonal.sort_values(kind="quicksort", ascending=False)
    return ranked.iloc[:k]

def identify_bots_from_correlated_pairs(dayNumber,clusterNumber,topk_bots_in,botnames_list, saveToDrive=True):
    """
    Flags which of the top correlated accounts are known bots.

    Parameters
    ----------
    dayNumber : int
        Nth Day. Used in log output and the file name.
    clusterNumber : int
        Nth Cluster. Used in log output and the file name.
    topk_bots_in : Pandas Series
        Correlation values indexed by account pairs, as returned by
        get_top_correlation_pairs. It is not modified.
    botnames_list : list
        Names of the known bots.
    saveToDrive : boolean
        When True, the result is written to
        PROJECT_ROOT/SWM_correlatedPairs/<PREFIX><day>_<CLUS_PREFX><cluster>correlatedPairs.csv.

    Returns
    -------
    Pandas Dataframe
        Indexed by the FIRST account of every pair, with the columns
        'accounts' (the correlation value of the pair; the column name is
        kept for compatibility with files already written) and 'isBot'
        (1 when the index account is in botnames_list, else 0).
    """
    # First account of every pair; computed without touching the caller's index.
    first_accounts = topk_bots_in.index.map(lambda pair: pair[0])

    topk_bots_df_in = topk_bots_in.to_frame(name='accounts')
    topk_bots_df_in.index = first_accounts

    # The account names live in the index; the 'accounts' column holds the
    # correlation values, so membership must be tested on the index.
    topk_bots_df_in["isBot"] = first_accounts.isin(botnames_list).astype(int)

    if saveToDrive :
        print('Saving most correlated bots for day:', dayNumber, ' for cluster number:', clusterNumber)
        folder_name = "SWM_correlatedPairs/"
        file_name = PREFIX + str(dayNumber) + "_" + CLUS_PREFX + str(clusterNumber) + "correlatedPairs.csv"
        topk_bots_df_in.to_csv(PROJECT_ROOT + str(folder_name) + str(file_name))
        print('Done')
    return topk_bots_df_in

# Checkpointing Methods

In [0]:
def checkpoint_compute_status(day_number):
      """
      Writes the given day number to the checkpoint file.

      The checkpoint is the JSON object {"day_number": <day_number>} stored at
      PROJECT_ROOT/SWM_metadataStatus/<PREFIX>compute_status.json; an existing
      file is overwritten.
      """
      print('Checkpointing Status to drive..' )
      checkpoint_name = PREFIX +  "compute_status.json"
      checkpoint_path = PROJECT_ROOT + "SWM_metadataStatus/" + checkpoint_name
      with open(checkpoint_path, 'w') as checkpoint_file:
          json.dump({'day_number': day_number}, checkpoint_file)

def get_checkpoint_compute_status():
      """
      Returns the content of the checkpoint file.

      Reads PROJECT_ROOT/SWM_metadataStatus/<PREFIX>compute_status.json, the
      file written by checkpoint_compute_status.

      Returns
      -------
      dict
          The decoded JSON object (contains the key 'day_number').

      Raises
      ------
      FileNotFoundError
          When no checkpoint has been written yet.
      """
      folder_name = "SWM_metadataStatus/"
      file_name = PREFIX +  "compute_status.json"
      total_path=PROJECT_ROOT + str(folder_name) + str(file_name)
      # The context manager closes the file even when the JSON is malformed.
      with open(total_path) as status_file:
          return json.load(status_file)


# Day Wise Statistics Methods

In [0]:
def compute_all_day_statistics(saveToDrive=True):
  """
  Splits the global time_sorted_tweet_data into consecutive windows of
  ACTUAL_SECONDS_IN_DAY seconds and records one row per non-empty window.

  Each recorded row holds 'day_number', 'start_epoch', 'end_epoch' and
  'no_of_tweets'. Note that 'no_of_tweets' is the number of UNIQUE
  'retweet_tid' values in the window, not the number of rows.
  Windows without any tweet are skipped and do not consume a day number.

  Parameters
  ----------
  saveToDrive : boolean
      When True, the table is written to
      PROJECT_ROOT/SWM_metadataStatus/<PREFIX>all_day_statistics.csv.

  Returns
  -------
  None.
  """
  # NOTE(review): prev_length is never read in this function.
  prev_length = 0
  dayNumber=0
  # Start two seconds before the first tweet.
  startTime = time_sorted_tweet_data['postedEpochtime'].values[0]-2
  end_of_time = time_sorted_tweet_data['postedEpochtime'].values[-1]
  result=[]
  next_day=0
  # Progress-bar increment per window, in percent:
  # 8640000 = 100 * 86400, i.e. 100 * (one day) / (total time span).
  diff=8640000/(end_of_time-startTime)

  with tqdm(total=100) as pbar: 
    while(startTime<end_of_time):
      
        pbar.update(diff)
        next_day = (startTime) + (ACTUAL_SECONDS_IN_DAY)
    
        # Both bounds are inclusive, and the next window starts exactly at
        # next_day, so a tweet posted exactly at a boundary falls into two
        # consecutive windows.
        qury = str(startTime) + ' <= postedEpochtime <= ' + str(next_day)
          
        dayN_tweets = time_sorted_tweet_data.query(qury)
        unique_retweet_tid=len(dayN_tweets['retweet_tid'].unique())

        
        # Only non-empty windows are recorded and numbered.
        if(unique_retweet_tid!=0):
          # print((dayNumber,startTime,next_day,unique_retweet_tid))
          result.append((dayNumber,startTime,next_day,unique_retweet_tid))
          dayNumber+=1
          
        startTime=next_day

  all_day_statistics_df = pd.DataFrame(result, columns =['day_number', 'start_epoch','end_epoch','no_of_tweets']) 

  if saveToDrive :
        print('Saving all_day_statistics to drive..' )
        folder_name = "SWM_metadataStatus/"
        file_name = PREFIX +  "all_day_statistics.csv"
        all_day_statistics_df.to_csv(PROJECT_ROOT + str(folder_name) + str(file_name) , index=False)
        print('Done')

def get_all_day_statistics():
   """
   Loads the per-day statistics table from Drive.

   Returns
   -------
   Pandas Dataframe
       Content of PROJECT_ROOT/SWM_metadataStatus/<PREFIX>all_day_statistics.csv,
       the file written by compute_all_day_statistics.
   """
   print('Loading all_day_statistics from drive..' )
   statistics_name = PREFIX +  "all_day_statistics.csv"
   statistics_path = PROJECT_ROOT + "SWM_metadataStatus/" + statistics_name
   statistics_table = pd.read_csv(statistics_path)
   print('Done')
   return statistics_table

# Compute Methods

In [0]:
def compute_by_epoch_range(dayNumber,startTime,endTime):
    """
    Runs the full pipeline for the tweets posted in one epoch range.

    Steps: select the rows with startTime <= postedEpochtime <= endTime,
    collapse near-duplicate retweet ids, build the graph, obtain the
    Node2Vec vectors (from the pickle cache when one exists), cluster the
    users and save one correlation matrix per cluster.

    Ranges with fewer than 10 rows are skipped. The check is done right
    after the selection, so no shingling, graph building, embedding or
    pickle writing is performed for a skipped range.

    Parameters
    ----------
    dayNumber : int
        Nth Day. Used for the pickle cache and the output file names.
    startTime : int
        Inclusive lower epoch bound.
    endTime : int
        Inclusive upper epoch bound.

    Returns
    -------
    None.
    """
    print(" Compute for",dayNumber,startTime,endTime)
    qury = str(startTime) + ' <= postedEpochtime <= ' + str(endTime)
    # replaceNearDuplOptimized writes into the frame it receives, so work on
    # an explicit copy of the selection rather than on the query result.
    dayN_tweets = time_sorted_tweet_data.query(qury).copy()

    # Skip small ranges before any expensive work is started.
    lengthofdayNTweets = len(dayN_tweets)
    if lengthofdayNTweets < 10:
        print("Skipping for day: ", dayNumber , " as it has only lengthofdayNTweets=",lengthofdayNTweets)
        return

    dayN_tweets = replaceNearDuplOptimized(dayN_tweets, performShingles(dayN_tweets))
    dayN_tweets = dayN_tweets.drop('body', axis=1)

    # Node2Vec results are cached per day number; the graph is only needed
    # when the cache is empty.
    if not isPickleExisting(dayNumber):
        G = getGraphfromDF(dayN_tweets)
        vector_reps = getNode2VecData(G, dayN_tweets)
        saveToPickle(vector_reps, dayNumber)
        # Release the graph before clustering.
        del G
    else:
        vector_reps = loadPickle(dayNumber)

    df_WithClLabel = performClustering(dayN_tweets, vector_reps, dayNumber, NO_OF_CLUSTERS, toVisualize=False)
    # The day frame is no longer needed once the clusters are formed.
    del dayN_tweets
    for cluster in range(NO_OF_CLUSTERS):
        performCorrelation(cluster, dayNumber,
                           df_WithClLabel.get_group(cluster),
                           saveToDrive=True, toVisualize=False)

    print("Computation Done For Day Number:",dayNumber)
 
def compute_job_by_range(start_date,end_date):
    """
    Runs compute_by_epoch_range for every day number from start_date to
    end_date, both included.

    The epoch bounds of each day are taken from the statistics table
    returned by get_all_day_statistics (first three columns of the row whose
    'day_number' matches). Raises IndexError when a day number is missing
    from that table.
    """
    all_day_statistics_df_in = get_all_day_statistics()
    print('Compute By Range With \n start_date:',start_date,' end_date:',end_date)

    for day_number in range(start_date, end_date + 1):
        is_requested_day = all_day_statistics_df_in['day_number'] == day_number
        day_row = all_day_statistics_df_in.loc[is_requested_day].values[0]
        compute_by_epoch_range(dayNumber=day_row[0],
                               startTime=day_row[1],
                               endTime=day_row[2])
        
        
def compute_all(saveStatus=True,resumeFromPrevious=True):
    """
    Runs compute_by_epoch_range for every remaining day of the statistics
    table returned by get_all_day_statistics.

    Parameters
    ----------
    saveStatus : boolean
        When True, the checkpoint file is updated after every completed day.
    resumeFromPrevious : boolean
        When True, processing starts at the day following the one stored in
        the checkpoint file. When no checkpoint file exists yet, processing
        starts at day 0. When the checkpoint already points at the last day,
        nothing is computed.

    Returns
    -------
    None.
    """
    all_day_statistics_df_in=get_all_day_statistics()
    end_date =all_day_statistics_df_in.loc[all_day_statistics_df_in.index[-1], "day_number"]

    start_date=0
    if(resumeFromPrevious):
        try:
            last_completed_day = get_checkpoint_compute_status()["day_number"]
        except FileNotFoundError:
            # First run: there is nothing to resume from.
            last_completed_day = None

        if last_completed_day is not None:
            start_date_indx = last_completed_day + 1
            if start_date_indx >= len(all_day_statistics_df_in):
                # The checkpoint already covers the last day of the table.
                print('Nothing to compute: checkpoint is at the last day', last_completed_day)
                return
            start_date=all_day_statistics_df_in.loc[all_day_statistics_df_in.index[start_date_indx], "day_number"]

    print('Compute By All With \nstart_date:',start_date,' end_date:',end_date)
    for day_number in range(start_date,end_date+1):

        temp_holder=all_day_statistics_df_in.loc[all_day_statistics_df_in['day_number'] == day_number]
        compute_by_epoch_range(dayNumber=temp_holder.values[0][0],startTime= temp_holder.values[0][1],endTime= temp_holder.values[0][2])
        # Honour the flag instead of always writing the checkpoint.
        if saveStatus:
            checkpoint_compute_status(day_number)

In [0]:
def compute_all_topk(start_day,end_day,K,threshold=8000,apply_threshold=True,no_of_clusters=5):
      """
      Driver that extracts and saves the top-K correlated pairs for every
      cluster of every day in range(start_day, end_day); end_day itself is
      NOT processed.

      For each (day, cluster) the saved correlation matrix is loaded. When
      apply_threshold is True and the matrix has more than `threshold` rows,
      the cluster is skipped. Any exception raised for a cluster is printed
      and the loop moves on to the next one.
      """
      botnames_list, _unused_lookup = bot_list_prep()
      day_cluster_pairs = ((day, cluster)
                           for day in range(start_day, end_day)
                           for cluster in range(no_of_clusters))
      for dayNumber, clusterNumber in day_cluster_pairs:
          try:
              correlation_df = get_cluster_correlation(dayNumber, clusterNumber)
              row_cnt = correlation_df.shape[0]
              too_large = apply_threshold and (row_cnt > threshold)
              if too_large:
                  print(" Skipping for Day Number ", dayNumber, " Cluster", clusterNumber, " as number of rows :" , str(row_cnt))
              else:
                  topk_bots = get_top_correlation_pairs(correlation_df, k=K, absolute=False)
                  identify_bots_from_correlated_pairs(dayNumber, clusterNumber, topk_bots,
                                                      botnames_list, saveToDrive=True)
          except Exception as e:
              print(" Failed for Day Number ", dayNumber, " Cluster", clusterNumber)
              print (e)
      print(" All Top K Correlation Pairs Calculated !")


In [0]:
def get_all_correlated_account_frequency(start_day,end_day,no_of_clusters):
      """
      Counts in how many (day, cluster) top-K files every account appears.

      Reads PROJECT_ROOT/SWM_correlatedPairs/<PREFIX><day>_<CLUS_PREFX><cluster>correlatedPairs.csv
      for every day in range(start_day, end_day) (end_day excluded) and every
      cluster in range(no_of_clusters). Within one file an account is counted
      once, however many rows it has. Files that cannot be read are reported
      and skipped.

      Returns
      -------
      collections.defaultdict
          Maps a value of the first CSV column to the number of files it
          occurs in.
      """
      appearance_count = collections.defaultdict(int)
      pairs_folder = PROJECT_ROOT + "SWM_correlatedPairs/"

      for dayNumber in range(start_day, end_day):
          for clusterNumber in range(no_of_clusters):
              try:
                  print('Loading most correlated bots for day:', dayNumber, ' for cluster number:', clusterNumber)
                  pairs_file = PREFIX + str(dayNumber) + "_" + CLUS_PREFX + str(clusterNumber) + "correlatedPairs.csv"
                  curr_topk = pd.read_csv(pairs_folder + pairs_file)
                  # De-duplicate within the file before counting.
                  for account in set(curr_topk.iloc[:, 0].tolist()):
                      appearance_count[account] += 1
              except Exception as e:
                  print(" Failed for Day Number ", dayNumber, " Cluster", clusterNumber)
                  print (e)

      return appearance_count


In [0]:


def filter_bots_from_topK(all_topk_dict):
    """
    Keeps the known bots among the counted top-K accounts and saves them.

    Parameters
    ----------
    all_topk_dict : dict
        Maps an account name to its appearance count.

    The accounts are ordered by descending count, restricted to the names
    returned by bot_list_prep, printed, and written without header to
    PROJECT_ROOT/SWM_metadataStatus/bots_identified1.csv.
    """
    botnames_list, bot_name_dict = bot_list_prep()

    by_count_desc = sorted(all_topk_dict.items(), key=lambda item: item[1], reverse=True)
    all_topk_df = pd.DataFrame.from_dict(collections.OrderedDict(by_count_desc),
                                         orient='index', columns=['count'])

    is_known_bot = all_topk_df.index.isin(botnames_list)
    filtered = all_topk_df.loc[is_known_bot]
    print("filtered",len(filtered),"\n",filtered)

    output_path = PROJECT_ROOT + "SWM_metadataStatus/" + "bots_identified1.csv"
    filtered.to_csv(output_path, header=False)

def save_topK_accounts(all_topk_dict):
    """
    Saves every counted top-K account, ordered by descending count.

    Parameters
    ----------
    all_topk_dict : dict
        Maps an account name to its appearance count.

    The table is written without header to
    PROJECT_ROOT/SWM_metadataStatus/all_top_accounts1.csv and then printed.
    """
    by_count_desc = sorted(all_topk_dict.items(), key=lambda item: item[1], reverse=True)
    all_topk_df = pd.DataFrame.from_dict(collections.OrderedDict(by_count_desc),
                                         orient='index', columns=['count'])

    output_path = PROJECT_ROOT + "SWM_metadataStatus/" + "all_top_accounts1.csv"
    all_topk_df.to_csv(output_path, header=False)

    print("all_topk_df",len(all_topk_df),"\n",all_topk_df)






# **Run from This section**

In [39]:
# Step 1: compute and save the per-cluster correlation matrices for every day
# that has not been processed yet (resumes from the checkpoint file).
compute_all(saveStatus=True,resumeFromPrevious=True)

# Step 2: count, over days 0..9 and clusters 0..4, in how many top-K files each
# account appears.
# NOTE(review): this reads the "correlatedPairs.csv" files that are written via
# compute_all_topk, which is not called in this cell -- presumably they were
# produced by an earlier run; confirm before relying on a fresh execution.
all_topk_dict=get_all_correlated_account_frequency(start_day=0,end_day=10,no_of_clusters=5)

# Step 3: save the subset of those accounts that are known bots.
filter_bots_from_topK(all_topk_dict)

# Step 4: save the complete ranking of accounts.
save_topK_accounts(all_topk_dict)

Loading most correlated bots for day: 0  for cluster number: 0
Loading most correlated bots for day: 0  for cluster number: 1
Loading most correlated bots for day: 0  for cluster number: 2
Loading most correlated bots for day: 0  for cluster number: 3
Loading most correlated bots for day: 0  for cluster number: 4
Loading most correlated bots for day: 1  for cluster number: 0
Loading most correlated bots for day: 1  for cluster number: 1
Loading most correlated bots for day: 1  for cluster number: 2
Loading most correlated bots for day: 1  for cluster number: 3
Loading most correlated bots for day: 1  for cluster number: 4
Loading most correlated bots for day: 2  for cluster number: 0
Loading most correlated bots for day: 2  for cluster number: 1
Loading most correlated bots for day: 2  for cluster number: 2
Loading most correlated bots for day: 2  for cluster number: 3
Loading most correlated bots for day: 2  for cluster number: 4
Loading most correlated bots for day: 3  for cluster nu