## Preprocessing of text data

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re

from sklearn.model_selection import train_test_split

In [2]:
# Load the raw tweet CSV. Column names are supplied explicitly, so the first line
# of the file is read as data rather than as a header.
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = pd.read_csv('../data/twitter/train.csv', encoding='latin-1', names=column_names)

# Draw a small random sample, stratified on the sentiment label so the sample keeps
# the label proportions of the full data. The other side of the split is discarded.
tweet_text = df[['text']]
sentiment = df[['target']]
X_subset, _, y_subset, _ = train_test_split(
    tweet_text,
    sentiment,
    stratify=sentiment,
    train_size=0.00625,  # 0.625% of the rows: roughly 10k out of 1.6 million
    random_state=42,  # fixed seed so the same sample is drawn on every run
)

# Only the sampled tweet text (X_subset) is used for training all the models
contents = X_subset['text'].tolist()

In [3]:
# Fetch the NLTK resources this notebook asks for, one after another
nltk_resources = ('stopwords', 'punkt', 'wordnet')
download_status = [nltk.download(resource) for resource in nltk_resources]

# Cell output: the status returned by the last download
download_status[-1]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yjiah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yjiah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yjiah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Text cleaning pipeline: turns each sampled tweet into a list of lemmatized tokens.
# Every step keeps one entry per tweet, in the same order as `contents`.

# lowercase every tweet (each tweet is kept whole; no sentence splitting happens here)
sentences_lst = [tweet.lower() for tweet in contents]

# remove URLs
sentences_no_urls = [re.sub(r'http\S+|www\S+', '', sentence) for sentence in sentences_lst]

# remove hashtags and usernames (the whole tag / handle is dropped, not just the symbol)
sentences_no_tags = [re.sub(r'@\w+|#\w+', '', sentence) for sentence in sentences_no_urls]

# remove punctuation, but keep apostrophes for now: the stopword list contains
# contractions such as "don't", and stripping the apostrophe first would turn them
# into "dont", which no longer matches and would slip through the stopword filter
sentences_no_punc = [re.sub(r"[^\w\s']", '', sentence) for sentence in sentences_no_tags]

# tokenize on whitespace, trimming apostrophes that merely wrap a token ('hello' -> hello)
tokenized_sentences = [
    [word.strip("'") for word in sentence.split()]
    for sentence in sentences_no_punc
]

# stopword removal (also drops tokens that were nothing but apostrophes)
stop_words = set(stopwords.words('english'))
filtered_tokenized_sentences = [
    [word for word in sentence if word and word not in stop_words]
    for sentence in tokenized_sentences
]

# now that stopwords are gone, strip the remaining apostrophes so the surviving
# tokens are punctuation-free (e.g. "neck's" -> "necks")
filtered_tokenized_sentences = [
    [word.replace("'", '') for word in sentence]
    for sentence in filtered_tokenized_sentences
]

# lemmatization (lemmatize is called without a part-of-speech tag, so words are
# treated as nouns, NLTK's default)
lemmatizer = WordNetLemmatizer()
filtered_tokenized_sentences = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in filtered_tokenized_sentences]

In [5]:
# Attach the cleaned tokens to the sampled tweets; the token lists were built from
# X_subset['text'] in order, so they line up row for row
X_subset['processed_text'] = filtered_tokenized_sentences

# Binarise the label: 0 stays 0 (bad sentiment), anything else (4 in this data)
# becomes 1 (good sentiment)
y_subset['target'] = y_subset['target'].ne(0).astype('int64')

# Put label and text columns side by side, then discard the original row index
label_and_text_frames = [y_subset, X_subset]
combined = pd.concat(label_and_text_frames, axis=1)
final_dataset = combined.reset_index(drop=True)

In [6]:
# Preview the first five rows of the assembled dataset
final_dataset.head(n=5)

Unnamed: 0,target,text,processed_text
0,0,"still sitting under the dryer, my neck hurts","[still, sitting, dryer, neck, hurt]"
1,1,@sarahshah this is my nightmare (even tho i on...,"[nightmare, even, tho, 3, post]"
2,0,"@mjvarela black is good... tight, or should I ...","[black, good, tight, say, tight, good]"
3,0,Takes forever for everybody to get ready.,"[take, forever, everybody, get, ready]"
4,0,@Bklyncookie omg all the LA bad weather aura i...,"[omg, la, bad, weather, aura, trickling, bay, ..."


In [7]:
# Split the final dataset into train (80%) and test (remaining rows) before export;
# the fixed seed keeps the partition the same on every run
split_options = {'train_size': 0.8, 'random_state': 42}
train, test = train_test_split(final_dataset, **split_options)

In [8]:
# Write the train and test partitions to CSV without the row index.
# Note: processed_text holds Python lists, which CSV stores as their string form,
# so whoever reads these files back has to parse that column again.
export_targets = (
    (train, '../data/twitter/train_processed.csv'),
    (test, '../data/twitter/test_processed.csv'),
)
for frame, destination in export_targets:
    frame.to_csv(destination, index=False)