# NLP Preprocessing

## !! Requirements

You need at least 16GB of memory to run this notebook. It's recommended to have 32GB.

In [None]:
# Data handling and train/test splitting.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Tokenisation, stop words and the bag-of-words dictionary.
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
# Lemmatisation and stemming.
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np  # NOTE(review): repeats the numpy import above; harmless.
# Seed NumPy's global random number generator.
np.random.seed(2018)
import nltk
# Download the WordNet corpus; presumably needed by WordNetLemmatizer.
nltk.download('wordnet')

## Splitting user data into train and test

Only the user dataset was split with scikit-learn. The tweets were then filtered into train or test according to the group that the user who posted each tweet belongs to.

In [None]:
# Split the user tables into train and test sets.
# Labels: 1.0 for traditional spambots, 0.0 for genuine accounts.
df_tradicionalbot_us = pd.read_csv('datasets_full/traditional_spambots_1/users.csv')
df_genuine_us = pd.read_csv('datasets_full/genuine_accounts.csv/users.csv')
df_genuine_us = df_genuine_us.drop(['test_set_1','test_set_2'], axis=1)

# One label per row. Sizing the label array by the number of *unique* ids
# would make train_test_split fail as soon as an id is repeated, because the
# frame and the labels would have different lengths.
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    df_tradicionalbot_us, np.ones(len(df_tradicionalbot_us)), random_state=42)
X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
    df_genuine_us, np.zeros(len(df_genuine_us)), random_state=42)

# Stack bots first, genuine accounts second, keeping rows and labels aligned.
X_train = pd.concat([X_train_b, X_train_g])
y_train = np.hstack([y_train_b, y_train_g])

X_test = pd.concat([X_test_b, X_test_g])
y_test = np.hstack([y_test_b, y_test_g])

# Persist the labels indexed by user id, next to the matching user rows.
y_test = pd.DataFrame(y_test)
y_test.index = X_test['id'].values
y_test.to_csv("y_test.csv")
X_test.to_csv("x_test.csv")

y_train = pd.DataFrame(y_train)
y_train.index = X_train['id'].values
y_train.to_csv("y_train.csv")
X_train.to_csv("x_train.csv")

# Drop the references so the frames can be garbage-collected.
df_tradicionalbot_us = df_genuine_us = None
X_train_b = X_test_b = y_train_b = y_test_b = None
X_train_g = X_test_g = y_train_g = y_test_g = None
X_train = y_train = X_test = y_test = None

In [None]:
# Reload the user splits from disk (test first, then train).
x_test, x_train = (pd.read_csv(path) for path in ("x_test.csv", "x_train.csv"))

In [None]:
# Split the tweet data.
# Index the user splits by account id so tweets can be matched on user_id.
x_test.set_index("id", inplace=True)
x_train.set_index("id", inplace=True)

# Columns we keep, and the position each one occupies among the 28 columns
# of the genuine-accounts tweets file; the rest get placeholder names.
aaa = ["id","text","source","user_id","in_reply_to_user_id","num_hashtags","num_urls","num_mentions","timestamp"]
cols = ["col " + str(i) for i in range(28)]
for position, name in zip([0,1,2,3,6,19,20,21,22], aaa):
    cols[position] = name

# `error_bad_lines=False` was removed from read_csv in pandas 2.0;
# `on_bad_lines="warn"` keeps the same behaviour (skip malformed rows and
# report them) and is available from pandas 1.3 onwards.
gen_tweets = pd.read_csv(
    "datasets_full/genuine_accounts.csv/tweets.csv",
    on_bad_lines="warn",
    quotechar='"',
    encoding='UTF-8',
    names=cols,
)
gen_tweets = gen_tweets[aaa]

def func(x):
    """Return True when ``int(x)`` succeeds, False otherwise.

    Used as a row filter for the ``user_id`` column. Only the errors that
    ``int()`` raises for unconvertible values are treated as "not an
    integer": ``TypeError`` (e.g. ``None``), ``ValueError`` (e.g. text, NaN)
    and ``OverflowError`` (infinity). Anything else, such as
    ``KeyboardInterrupt``, propagates instead of being swallowed.
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Keep only the rows whose user_id parses as an integer, then store it as int.
gen_tweets = gen_tweets[gen_tweets['user_id'].apply(func)]
gen_tweets['user_id'] = gen_tweets['user_id'].apply(int)

# Full list of tweet column names; not used by the code that follows.
cols = ["id","text","source","user_id","truncated","in_reply_to_status_id","in_reply_to_user_id","in_reply_to_screen_name","retweeted_status_id","geo","place","contributors","retweet_count","reply_count","favorite_count","favorited","retweeted","possibly_sensitive","num_hashtags","num_urls","num_mentions","created_at","timestamp","crawled_at","updated"]

# The spambot tweets.csv was opened in Excel and saved as xlsx, which is what
# made it loadable with pandas.
spam_tweets = pd.read_excel("datasets_full/traditional_spambots_1/tweets.xlsx")
spam_tweets = spam_tweets[aaa]

spam_tweets = spam_tweets[spam_tweets['user_id'].apply(func)]
spam_tweets['user_id'] = spam_tweets['user_id'].apply(int)

# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame
# (spambot tweets first, genuine tweets second).
merged = pd.concat([spam_tweets, gen_tweets])

# A tweet goes to train or test depending on the split its author is in.
train_index = set(x_train.index)
tweet_train = merged[merged['user_id'].apply(lambda user_id: user_id in train_index)]

test_index = set(x_test.index)
tweet_test = merged[merged['user_id'].apply(lambda user_id: user_id in test_index)]

tweet_test.to_csv("x_test_small.csv")
tweet_train.to_csv("x_train_small.csv")

In [None]:
# Drop every reference created while splitting the tweets so the memory can
# be reclaimed before the next, heavier section.
x_train = x_test = aaa = cols = gen_tweets = spam_tweets = None
merged = train_index = tweet_train = test_index = tweet_test = None

## Pre-processing Tweets

The section below is very expensive to run: some steps consume about 14GB of memory, so keep that in mind.

In [None]:
# Load the training tweets and derive the per-tweet features.
df = pd.read_csv("x_train_small.csv")
# is_reply is 0 only when in_reply_to_user_id equals 0; any other value gives 1.
df['is_reply'] = df['in_reply_to_user_id'].map(lambda reply_to: 1 if reply_to != 0 else 0)
df = df.drop(["source","timestamp","Unnamed: 0","id",'in_reply_to_user_id'], axis=1)
# Rows without text cannot be tokenized, so they are discarded.
df = df.dropna(subset=['text'])
tweets = df['text']

We use the NLTK and Gensim libraries to extract the most important tokens from the tweets; in the libraries this step is called lemmatization. Then we create a Gensim dictionary containing only 200 words. This section of the code is very time-consuming, so we did not feel the need to tune this hyper-parameter, because the results were already satisfactory.

In [None]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the result.

    Uses the module-level ``stemmer``, which must be defined before the
    first call.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)
def preprocess(text):
    """Tokenize ``text`` and return the processed form of every kept token.

    A token is kept when it is not a gensim stop word and is longer than
    three characters; each kept token goes through ``lemmatize_stemming``.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in stop_words and len(token) > 3
    ]

In [None]:
# Shared English stemmer, read as a global by lemmatize_stemming().
stemmer = SnowballStemmer("english")

In [None]:
# Tokenize every tweet, then build the gensim dictionary over all token lists.
processed_docs = tweets.map(preprocess)
dictionary = gensim.corpora.Dictionary(processed_docs)

# Sanity check: print the first 11 (id, token) pairs of the dictionary.
count = 0
for count, (k, v) in enumerate(dictionary.iteritems(), start=1):
    print(k, v)
    if count > 10:
        break

In [None]:
# Shrink the vocabulary in place (no_above=0.5, at most 200 tokens kept;
# gensim's other filter_extremes defaults still apply), then encode every
# tweet as a bag of words against the reduced dictionary.
dictionary.filter_extremes(no_above=0.5, keep_n=200)
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [None]:
# One-column frame with the tweet text; ID starts out as a copy of the index.
X = tweets.to_frame()
X['ID'] = X.index

class aaa:
    """Helper counter used only to build the ID column.

    Each call to ``func`` returns the number of calls made before it, so
    successive calls yield 0, 1, 2, ...
    """

    def __init__(self):
        self.count = 0

    def func(self, x):
        """Return the current counter value and advance it; ``x`` is ignored."""
        self.count += 1
        return self.count - 1

# Replace ID with a running counter starting at 0, one value per row; these
# values are later used as positions into bow_corpus.
a = aaa()
X['ID'] = X['ID'].apply(a.func)

def calcula(word_index: int, tweet_index: int) -> int:
    """Return the count stored for one dictionary token in one tweet.

    Reads ``bow_corpus[tweet_index]``, a sequence of ``(token_id, count)``
    pairs, and returns the count of the first pair whose id equals
    ``word_index``; returns 0 when no pair matches.

    Raises:
        Exception: the original error raised by the lookup (for instance
            ``IndexError`` when ``tweet_index`` is out of range), re-raised
            unchanged after the two indices are printed.
    """
    try:
        pairs = bow_corpus[tweet_index]
        return next((count for token_id, count in pairs if token_id == word_index), 0)
    except Exception:
        # Debugging aid: show which lookup failed, then let the real error
        # (with its traceback) propagate instead of a bare Exception.
        print(word_index)
        print(tweet_index)
        raise

# For every token of the dictionary, compute its count in each tweet from the
# bow_corpus structure and save that single column to its own pickle, so the
# whole feature matrix never has to sit in memory at once.
# The loop covers the dictionary's actual size: a hard-coded 200 raises
# KeyError when fewer than 200 tokens survived the filtering.
# The "words/" directory must already exist.
for word_index in range(len(dictionary)):
    x = pd.DataFrame(X['ID'].apply(lambda tweet_index: calcula(word_index, tweet_index)))
    x.to_pickle("words/" + dictionary[word_index] + ".pkl")
    print(word_index)

In [None]:
# Only now add the per-token count columns to the training frame, one pickle
# at a time. The loop covers the dictionary's actual size: a hard-coded 200
# raises KeyError when the dictionary holds fewer than 200 tokens.
for word_index in range(len(dictionary)):
    token = dictionary[word_index]
    df[token] = pd.read_pickle("words/" + token + ".pkl")
    print(word_index)

df.to_csv("df.csv")

In [None]:
 # Drop the reference to the training frame so its memory can be reclaimed.
 # NOTE(review): this statement starts with a stray space; a plain Python
 # script would reject the unexpected indent - confirm and remove it.
 df = None

In [None]:
# Same pipeline for the test data. The dictionary fitted on the training
# tweets is reused, so both sets share the same token columns.
df_test = pd.read_csv("x_test_small.csv")
df_test['is_reply'] = df_test['in_reply_to_user_id'].apply(lambda x: 0 if x == 0 else 1)
df_test.drop(["source","timestamp","Unnamed: 0","id",'in_reply_to_user_id'],axis=1,inplace=True)
df_test.dropna(subset=['text'], inplace=True)
tweets = df_test['text']

# Rebuild bow_corpus for the test tweets; calcula() reads this global.
processed_docs = tweets.map(preprocess)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
X = pd.DataFrame(tweets)
a = aaa()
X['ID'] = X['text'].apply(lambda x: a.func(x))

# Write one count column per token under "words_test/" (the directory must
# already exist). The loop covers the dictionary's actual size instead of a
# hard-coded 200, which raises KeyError for a smaller dictionary.
for word_index in range(len(dictionary)):
    token = dictionary[word_index]
    x = pd.DataFrame(X['ID'].apply(lambda tweet_index: calcula(word_index, tweet_index)))
    x.to_pickle("words_test/" + token + ".pkl")
    print(word_index)

# Read the columns back from "words_test/". Reading from "words/" here would
# attach the counts computed for the *training* tweets to the test frame.
for word_index in range(len(dictionary)):
    token = dictionary[word_index]
    df_test[token] = pd.read_pickle("words_test/" + token + ".pkl")
    print(word_index)

df_test.to_csv("df_test.csv")

## In the section below, we group the tweets by user using the mean.

In [None]:
# Aggregate the training tweets per user (mean of every numeric column) and
# attach each user's label.
df = pd.read_csv("df.csv")
x_train = pd.read_csv("x_train_small.csv")
# NOTE(review): this assignment aligns on the row index; df.csv had rows with
# missing text removed before it was written, so the two frames may not line
# up row for row - confirm this is intended.
df['text'] = x_train['text']
x_train = None
# numeric_only=True keeps the string `text` column out of the mean; without
# it pandas 2.0+ raises TypeError instead of silently dropping the column.
# NOTE(review): df.csv was saved with its index, so an "Unnamed: 0" column is
# averaged along with the features - confirm whether it should be dropped.
gb = df.groupby("user_id").mean(numeric_only=True)
y_train = pd.read_csv("y_train.csv")
y_train.columns = ["user_id","ans"]
gb = gb.merge(y_train,on="user_id",how="left")
gb.to_csv("ready_to_train.csv")

In [None]:
# Aggregate the test tweets per user (mean of every numeric column) and attach
# each user's label. The names x_train / y_train are reused from the training
# cell, but here they hold the *test* files.
df = pd.read_csv("df_test.csv")
x_train = pd.read_csv("x_test_small.csv")
# NOTE(review): this assignment aligns on the row index; df_test.csv had rows
# with missing text removed before it was written, so the two frames may not
# line up row for row - confirm this is intended.
df['text'] = x_train['text']
x_train = None
# numeric_only=True keeps the string `text` column out of the mean; without
# it pandas 2.0+ raises TypeError instead of silently dropping the column.
gb = df.groupby("user_id").mean(numeric_only=True)
y_train = pd.read_csv("y_test.csv")
y_train.columns = ["user_id","ans"]
gb = gb.merge(y_train,on="user_id",how="left")
gb.to_csv("ready_to_test.csv")

At this point, the tweets are ready for modeling.