<a href="https://colab.research.google.com/github/u2200579/Bot-Detection/blob/main/BotDetection2020Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab VM; a later cell copies the dataset
# files from "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 4.8 MB/s 
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.0.0.tar.gz (197 kB)
[K     |████████████████████████████████| 197 kB 4.4 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.0.0-py3-none-any.whl size=193022 sha256=22154dee31bf0b63de0d80af3cbbe3a1783152391d07c2c471f16e74c047f5a1
  Stored in directory: /root/.cache/pip/wheels/ec/29/4d/3cfe7452ac7d8d83b1930f8a6205c3c9649b24e80f9029fc38
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.0.0


In [None]:
import numpy as np
import pandas as pd
import emoji
import json
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
from sklearn.metrics.pairwise import cosine_similarity
import gensim
import gensim.downloader
import nltk  
nltk.download('punkt')
from tqdm import tqdm
tqdm.pandas()
from nltk.tokenize import word_tokenize
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Copy the three dataset splits from the mounted Drive into the working
# directory. Only test.json is read by the cells visible in this notebook.
!cp "/content/drive/My Drive/train.json" "train.json"
!cp "/content/drive/My Drive/dev.json" "dev.json"
!cp "/content/drive/My Drive/test.json" "test.json"

In [None]:
# Load the test split into `df`. Later cells use its ID, profile, tweet,
# neighbor, domain and label columns.
df = pd.read_json("test.json")#read json file
# Round-trip the `profile` column through JSON to obtain plain Python records
# (presumably one dict per account - confirm against the raw file).
json_struct=json.loads(df['profile'].to_json(orient='records'))
dfs1 = pd.DataFrame(json_struct) #flatten nested dictionary of profile into its own dataframe
dfs1 = dfs1.rename(columns={"id": "ID"})#rename the profile's "id" column to "ID", the key used for the merges at the end of the notebook

In [None]:
sid_obj = SentimentIntensityAnalyzer() #single shared VADER analyzer, reused by sentiment_calc for every tweet
def subject_calc(text): #function to calculate subjectivity of tweets
    """Return TextBlob's subjectivity score for ``text``.

    Falls back to 0 when TextBlob cannot score the value (for example a
    non-string such as NaN), so that one bad row does not abort a whole
    ``Series.apply``.
    """
    try:
        return TextBlob(text).sentiment.subjectivity
    except Exception:
        # Deliberate best-effort fallback. Catching ``Exception`` instead of
        # using a bare ``except`` keeps KeyboardInterrupt/SystemExit able to
        # stop a long-running apply.
        return 0

def sentiment_calc(text): #function to calculate sentiment of tweets
    """Return the VADER polarity scores of ``text``.

    The result is always a dict with the keys ``neg``, ``neu``, ``pos`` and
    ``compound``. When the analyzer cannot score the value (for example a
    non-string such as NaN) an all-zero dict is returned instead of a bare
    ``0``: the caller expands the result into one column per key, and a
    scalar fallback would create a stray extra column there.
    """
    try:
        return sid_obj.polarity_scores(text)
    except Exception:
        # Best-effort fallback with the same shape as a successful result.
        return {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
        
def mean_embeddings(s, embeddings=None):
    """Return the element-wise mean embedding of the words in ``s``.

    :param s: iterable of tokens (one tokenised tweet)
    :param embeddings: object supporting ``word in embeddings`` and
        ``embeddings.get_vector(word)``; defaults to the notebook-level
        ``glove_vec``
    :return: mean vector of all tokens that have an embedding, or ``np.nan``
        when none of the tokens has one
    """
    if embeddings is None:
        embeddings = glove_vec
    vectors = [embeddings.get_vector(word) for word in s if word in embeddings]
    if not vectors:
        # np.mean([]) would also yield NaN, but only after emitting a
        # "Mean of empty slice" RuntimeWarning for every such tweet.
        return np.nan
    return np.mean(vectors, axis=0)

In [None]:
glove_vec = gensim.downloader.load('glove-twitter-25')#load the pre-trained 'glove-twitter-25' word vectors through gensim's downloader; used by mean_embeddings and check_coverage



In [None]:
# One row per (account, tweet).
df2 = df.explode('tweet') #create a row for each tweet
# Normalise the tweet text BEFORE deriving df3 from df2. Accounts without
# tweets produce NaN after explode; if df3 were copied first (as it used to
# be) the feature frame would keep those NaN values, giving NaN counts and
# failed sentiment/subjectivity calls.
df2['tweet'] = df2['tweet'].fillna('').apply(str) #convert tweet to string value
df3 = df2.drop(["profile", 'neighbor', 'domain', 'label'], axis=1) #Remove other columns beside ID and tweet
# Simple per-tweet counts. These are plain substring counts, e.g. "RT" is
# also counted inside longer upper-case words.
df3['mentions'] = df3['tweet'].str.count("@") #count the number of @'s contained in each tweet
df3['hashtags'] = df3['tweet'].str.count("#") #count the number of #
df3['urls'] = df3['tweet'].str.count("https") #count the number of URL links
df3['retweets'] = df3['tweet'].str.count("RT") #count the number of RT in each tweet
df3['subjectivity_score'] = df3['tweet'].apply(subject_calc) #subjectivity of each tweet
df3['sentiment_score'] = df3['tweet'].apply(sentiment_calc) #sentiment scores of each tweet
# Expand the per-tweet score dict into one column per score so that each can be used as a feature.
df1 = pd.concat([df3.drop(['sentiment_score'], axis=1), df3['sentiment_score'].apply(pd.Series)], axis=1)

In [None]:
# Average every numeric tweet-level feature per account. `numeric_only=True`
# keeps the raw `tweet` text column out of the aggregation; current pandas
# releases raise instead of silently dropping non-numeric columns.
df1 = df1.groupby("ID").mean(numeric_only=True).reset_index()

In [None]:
import operator #used to build vocabulary to check similarity between word embeddings and tweet tokens

def check_coverage(vocab,embeddings_index):
    """Report how much of ``vocab`` is covered by ``embeddings_index``.

    :param vocab: dictionary mapping each word to its number of occurrences
    :param embeddings_index: mapping-like object; ``embeddings_index[word]``
        must raise ``KeyError`` for unknown words
    :return: list of ``(word, count)`` pairs for the words without an
        embedding, most frequent first
    """
    covered = {}
    oov = {}
    covered_occurrences = 0
    oov_occurrences = 0
    for word in tqdm(vocab):
        try:
            covered[word] = embeddings_index[word]
            covered_occurrences += vocab[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error
            # is a real problem and should surface.
            oov[word] = vocab[word]
            oov_occurrences += vocab[word]

    if not vocab:
        # Nothing to measure; avoid dividing by zero below.
        print('Vocabulary is empty; nothing to check')
        return []

    print('Found embeddings for {:.2%} of vocab'.format(len(covered) / len(vocab)))
    total_occurrences = covered_occurrences + oov_occurrences
    if total_occurrences:
        print('Found embeddings for  {:.2%} of all text'.format(covered_occurrences / total_occurrences))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

def build_vocab(sentences, verbose =  True):
    """
    Count how often every word occurs across all sentences.

    :param sentences: list of list of words
    :param verbose: show a progress bar while counting
    :return: dictionary of words and their count
    """
    counts = {}
    for tokens in tqdm(sentences, disable = (not verbose)):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts
  
# Vocabulary of the raw, whitespace-split tweets, built before any cleaning.
sentences = df2['tweet'].progress_apply(str.split).values
vocab = build_vocab(sentences)
# Peek at the first five entries only.
first_words = list(vocab)[:5]
print({word: vocab[word] for word in first_words})

100%|██████████| 199863/199863 [00:01<00:00, 185562.66it/s]
100%|██████████| 199863/199863 [00:01<00:00, 119877.46it/s]


{'RT': 66265, '@clevelanddotcom:': 1, 'Three': 101, 'Ohio': 65, 'House': 721}


In [None]:
def preprocessing (tweet):
  """Normalise Twitter-specific syntax in one tweet.

  Steps, in order: drop a leading "RT", replace links with ``url``, ``#``
  with ``hashtag`` and @mentions with ``user``, turn line breaks into
  spaces, replace d/d/d dates with ``date``, split on ``_`` and ``-``, and
  replace stand-alone numbers with ``number``.

  :param tweet: raw tweet text (str)
  :return: the normalised text (str)
  """
  #Remove RT
  tweet2 = re.sub(r'^RT[\s]+', '', tweet)
  #  Replace hyperlinks with URL. Only the link itself is consumed (\S+);
  #  the previous `.*` swallowed the rest of the line after every link.
  tweet2 = re.sub(r'https?://\S+', ' url ', tweet2)
  #  Replace hashtags with 'hashtag'
  tweet2 = re.sub(r'#', ' hashtag ', tweet2)
  # Replace @User with user; underscores are part of a handle
  tweet2 = re.sub(r'@[A-Za-z0-9_]+', ' user ', tweet2)
  #Replace line breaks with a space so adjacent words are not glued together
  tweet2 = re.sub(r'[\r\n]+', ' ', tweet2)
  #Replace date values with date
  tweet2 = re.sub(r'(\d+/\d+/\d+)', 'date', tweet2)
  #Split words joined by underscores or hyphens
  tweet2 = re.sub(r'_', ' ', tweet2)
  tweet2 = re.sub(r'-', ' ', tweet2)
  # Replace numeric terms in the tweet with 'number'. The lookarounds limit
  # this to whitespace-delimited tokens; the earlier pattern was a
  # JavaScript-style literal (/^...$/) that could never match.
  tweet2 = re.sub(r'(?<!\S)[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?!\S)', ' number ', tweet2)
  return (tweet2)


def _get_mispell(mispell_dict):
    """Build the lookup table and the regex used by replace_typical_misspell.

    Keys are normalised to lower case with a straight apostrophe. The regex
    matches every key case-insensitively and with either apostrophe style
    (' or ’), so "I’m", "i'm" and "I'M" all resolve to the same entry.

    :param mispell_dict: dictionary mapping a contraction to its expansion
    :return: tuple ``(normalised lookup dict, compiled regex)``
    """
    lookup = {key.lower().replace('’', "'"): value
              for key, value in mispell_dict.items()}
    # Escape the literal parts of every key and accept both apostrophes
    # wherever the key contains one.
    alternatives = [
        "['’]".join(re.escape(part) for part in key.split("'"))
        for key in lookup
    ]
    mispell_re = re.compile('(%s)' % '|'.join(alternatives), flags=re.IGNORECASE)
    return lookup, mispell_re


mispell_dict = {"i’m":"i am",
                "isn’t":"is not",
                "it’s":"it is",
                "don’t":"do not",
                "can’t":"can not",
                "we're":"we are",
                "that's":"that is",
                "i've":"i have",
                "you’re":"you are",
                "he’s":"he is",
                "couldn't":"could not",
                "wouldn't":"would not",
                "shouldn't":"should not",
                "ain't":"are not",
                }
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Expand the contractions listed in ``mispell_dict`` inside ``text``.

    Matching ignores case and apostrophe style; the replacement is the
    lower-case expansion stored in the dictionary.

    :param text: text to process (str)
    :return: text with the known contractions expanded
    """
    def replace(match):
        # Normalise the matched text the same way the lookup keys were
        # normalised. Unknown matches are left untouched.
        key = match.group(0).lower().replace('’', "'")
        return mispellings.get(key, match.group(0))

    return mispellings_re.sub(replace, text)

def clean_numbers(x):
    """Mask digits with '#': one '#' per digit, at most five per run of digits."""
    # Longest runs first, so a long number is masked as a whole before the
    # shorter patterns get a chance to see its digits.
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
        ('[0-9]', '#'),
    )
    for pattern, mask in masks:
        x = re.sub(pattern, mask, x)
    return x

def clean_text(x):
    """Strip or space out punctuation in ``x`` (converted to str first).

    '/', '-' and "'" become a space, '&' is padded with spaces, and every
    other listed punctuation character is removed.
    """
    table = {}
    # The first rule registered for a character wins, mirroring the order in
    # which the replacements are meant to apply.
    for punct in "/-'":
        table.setdefault(ord(punct), ' ')
    for punct in '&':
        table.setdefault(ord(punct), f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        table.setdefault(ord(punct), '')
    return str(x).translate(table)

# Cleaning pipeline for the tweet text, applied in this order:
#   emoji -> words, contraction expansion, Twitter-syntax normalisation,
#   punctuation clean-up, digit masking.
df2['tweet'] = (
    df2['tweet']
    .apply(lambda text: emoji.demojize(text, delimiters=(" ", " ")))  # emoji -> space-delimited names
    .progress_apply(replace_typical_misspell)                        # expand contractions
    .apply(preprocessing)                                            # urls, hashtags, mentions, dates
    .apply(clean_text)                                               # punctuation
    .progress_apply(clean_numbers)                                   # digits -> '#'
)
# Lower-cased, tokenised copy of the cleaned text; this is what gets embedded.
df2['tweets'] = (
    df2['tweet']
    .apply(str.lower)
    .progress_apply(word_tokenize)
)

100%|██████████| 199863/199863 [00:04<00:00, 45509.52it/s]
100%|██████████| 199863/199863 [00:01<00:00, 149808.11it/s]
100%|██████████| 199863/199863 [00:35<00:00, 5605.60it/s]


The cleaning steps above bring the tweet text as close as possible to the vocabulary of the pre-trained word embeddings. The approach follows https://www.kaggle.com/code/christofhenkel/how-to-preprocessing-when-using-embeddings/notebook

In [None]:
# Rebuild the vocabulary from the cleaned, tokenised tweets and measure how
# much of it the GloVe vectors cover.
vocab = build_vocab(df2['tweets']) 
oov = check_coverage(vocab,glove_vec) # (word, count) pairs that have no embedding, most frequent first

100%|██████████| 199863/199863 [00:01<00:00, 147283.56it/s]
100%|██████████| 200653/200653 [00:00<00:00, 262968.43it/s]

Found embeddings for 51.87% of vocab
Found embeddings for  95.46% of all text





In [None]:
oov[:10]#the ten most frequent tweet tokens that have no embedding (check_coverage returns them sorted by descending count)

In [None]:
# Represent every tweet by the mean embedding of its tokens (see mean_embeddings).
df2["Text_Sim"] = df2['tweets'].progress_apply(lambda x: mean_embeddings(x))
h = []  # placeholder; overwritten two statements below
# Drop tweets for which no embedding could be computed (missing Text_Sim).
# NOTE(review): this rebinds df2, so the rows are gone for any later cell.
df2 = df2.dropna(subset=['Text_Sim'])
s = df2.groupby("ID")
h = s['Text_Sim'].apply(np.stack).apply(cosine_similarity).apply(np.mean).reset_index()
# Per-account pipeline used for `h`:
#  .apply(np.stack)           # stack the account's tweet vectors into one matrix
#  .apply(cosine_similarity)  # pairwise cosine similarity between those tweets
#  .apply(np.mean)            # mean over the whole similarity matrix
# NOTE(review): the mean is taken over the full matrix, diagonal included, so
# an account with a single remaining tweet scores 1.0 - confirm this is intended.

  out=out, **kwargs)
100%|██████████| 199863/199863 [00:17<00:00, 11665.71it/s]


In [None]:
# Keep only the account ID and its label from the raw frame.
new_df = df[['ID', 'label']]
# Cast the profile IDs to nullable integers so they can be merged on "ID".
# NOTE(review): the cast goes through float64, which cannot represent
# integers above 2**53 exactly - confirm the IDs still match df['ID'].
dfs1['ID'] = dfs1['ID'].astype("float").astype("Int64")
# Default (inner) merges: averaged tweet features + similarity score, then
# profile columns, then labels.
# NOTE(review): accounts absent from `h` (no tweet with a known embedding)
# drop out of the result here.
# NOTE(review): `df1` is overwritten, so re-running this cell merges `h` a
# second time.
df1 = pd.merge(df1,h,on=["ID"])
df1 = pd.merge(dfs1,df1,on=["ID"])
new_df = pd.merge(df1,new_df,on=["ID"])
# Discard columns that are empty for every account.
new_df = new_df.dropna(axis=1, how='all')
new_df #display the combined frame: profile columns, averaged tweet features, similarity score and label

In [None]:
new_df.to_csv('/content/drive/My Drive/testset.csv',encoding='utf-8',index=False) #write the final feature table to Drive as UTF-8 CSV without the index column, for the modelling step