# File 05: Preprocessing User Timeline DataFrame

This file does exactly what you think it does: preprocessing, and a lot of it. First, we need to make sure the tweets we feed into the model for prediction are in the correct format. We also put a limit on the number of tweets a user should have: here we only consider users who have between 100 and 200 tweets, as this should make the predictions more accurate. We also remove any tweets that have fewer than 3 words.

### Input Files:
- 03-user-tweets-english-only.csv

### Output Files:
- 05-shortlisted-tweets.csv
- 05-shortlisted-usernames.csv

### Steps:
1. loading required libraries
1. read user timeline tweets from the input CSV file into a dataframe
1. create functions that will preprocess the dataset
1. preprocessing timeline tweets
1. making a list of all usernames
1. counting tweets by each user
1. shortlisting users with tweet count between 100 and 200
1. making final list of tweets and users
1. creating dataframes
1. saving dataframes

In [1]:
# loading required python libraries...
import re
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Artemis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# load the user timeline CSV and pull the USER and TWEET columns out as plain lists
df = pd.read_csv('../db/03-user-tweets-english-only.csv')
user, tweet = (df[column].values.tolist() for column in ('USER', 'TWEET'))

In [3]:
# create functions that will preprocess the dataset
# Stemmer instance; none of the preprocessing functions defined in this cell use it.
porter = PorterStemmer()
# Stored as a set so that each `word not in sw` test is a hash lookup
# instead of a scan over the whole stopword list.
sw = set(stopwords.words('english'))
# 'not' is taken out of the stopwords so it survives preprocessing
# (presumably to keep negation in the text - confirm with the model's training setup).
sw.discard('not')

def remove_tags(text):
    """Return *text* with every ``<...>`` span deleted, angle brackets included."""
    return re.sub(r'<[^>]+>', '', text)

def remove_single_chars(text):
    """Drop one-character tokens from *text* and join the rest with single spaces."""
    return " ".join(token for token in text.split() if len(token) >= 2)

def remove_stopwords(text, stop_words=None):
    """Return *text* with stopword tokens removed.

    The text is split on whitespace and the surviving tokens are re-joined
    with single spaces. Matching is exact and case-sensitive.

    Args:
        text: The string to filter.
        stop_words: Optional collection of tokens to drop. When omitted,
            the module-level ``sw`` collection is used.

    Returns:
        The filtered text; ``""`` if every token was a stopword.
    """
    if stop_words is None:
        stop_words = sw
    return " ".join(word for word in text.split() if word not in stop_words)

def preprocess_text(sen):
    """Clean one raw tweet into a lowercase, space-separated string of words.

    Steps, in order: strip ``<...>`` tags, lowercase, remove @mentions,
    remove URLs, replace every non-letter character with a space, drop
    stopwords, then drop one-character tokens.

    Args:
        sen: Raw tweet text. Any non-string value (for example ``None`` or
            the float NaN that pandas produces for an empty CSV cell) is
            treated as empty text instead of raising ``TypeError``.

    Returns:
        The cleaned text, or ``""`` when the input is not a string or no
        token survives the cleaning steps.
    """
    if not isinstance(sen, str):
        return ""

    sentence = remove_tags(sen).lower()
    # NOTE(review): this pattern needs a letter right after '@' and at least
    # two handle characters, so handles such as @_name or @1abc are not
    # removed here; the non-letter replacement below then strips only the '@'
    # and leaves the handle's letters in the text. Confirm whether that is
    # intended before changing the pattern.
    sentence = re.sub(r'@[A-Za-z]+[A-Za-z0-9-_]+', '', sentence)
    sentence = re.sub(r"http\S+", "", sentence)
    # digits, punctuation and every other non-letter become spaces
    sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)
    sentence = remove_stopwords(sentence)
    return remove_single_chars(sentence)

In [None]:
# preprocessing timeline tweets...
user = df.USER.values.tolist()
tweet = df.TWEET.values.tolist()

# clean every tweet, with a progress bar over the whole list
processed = [preprocess_text(raw) for raw in tqdm(tweet)]

# number of words left in each cleaned tweet
# NOTE(review): `counts` is not read again in any cell visible here, although
# the file header says tweets with fewer than 3 words are removed - confirm
# whether that filter is missing or applied elsewhere.
counts = [len(cleaned.split()) for cleaned in processed]

# raw and cleaned text side by side, one row per tweet
temp_df = pd.DataFrame(
    list(zip(user, tweet, processed)),
    columns=['USER', 'ORIGINAL', 'PROCESSED'],
)

In [None]:
# making a list of all usernames
# dict.fromkeys keeps one key per distinct user in first-seen order, so this
# yields the same list as appending unseen names one by one, without
# re-scanning the growing result list for every tweet.
username = list(dict.fromkeys(user))

In [None]:
# counting tweets by each user
np_user = np.array(user)
# Tally every user in a single pass instead of scanning the whole array once
# per username.
distinct_users, totals = np.unique(np_user, return_counts=True)
# .tolist() converts numpy scalars back to plain Python str/int values
count_by_user = dict(zip(distinct_users.tolist(), totals.tolist()))
# `tweetcount[i]` is the number of tweets by `username[i]`; a name with no
# match gets 0, which is what an empty np.where result produced before.
tweetcount = [count_by_user.get(name, 0) for name in username]

In [None]:
# shortlisting users with tweet count between 100 and 200 (both bounds inclusive)
shortlist = [
    name
    for name, total in tqdm(zip(username, tweetcount), total=len(username))
    if 100 <= total <= 200
]

In [None]:
# making final list of tweets and users
# Set lookup: testing membership against the shortlist list would scan it
# once per tweet.
shortlisted = set(shortlist)

final_user = []       # author of each kept tweet
final_tweet = []      # preprocessed text of each kept tweet
final_original = []   # raw text of each kept tweet
for name, cleaned, raw in tqdm(zip(user, processed, tweet), total=len(user)):
    if name in shortlisted:
        final_user.append(name)
        final_tweet.append(cleaned)
        final_original.append(raw)

In [None]:
# creating dataframes
# one row per kept tweet: author, preprocessed text, raw text
final_rows = list(zip(final_user, final_tweet, final_original))
final = pd.DataFrame(final_rows, columns=['USER', 'TWEET', 'ORIGINAL'])

# NOTE: this rebinds `username` to a one-column DataFrame of the shortlisted users
username = pd.DataFrame(shortlist, columns=['USER'])

In [None]:
# saving dataframes
for frame, path in (
    (final, '../db/05-shortlisted-tweets.csv'),
    (username, '../db/05-shortlisted-usernames.csv'),
):
    # index=False keeps the DataFrame row index out of the CSV
    frame.to_csv(path, index=False)