In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re

from sklearn.model_selection import train_test_split

In [2]:
# load the raw tweets; the column names are supplied explicitly via `names`
raw_columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = pd.read_csv('../data/twitter/train.csv', encoding='latin-1', names=raw_columns)

# draw a random training subsample, stratified on the sentiment label so that
# both sentiment classes stay represented for the sentiment analysis done later
tweet_text = df[['text']]
sentiment_labels = df[['target']]
X_train, X_test, y_train, y_test = train_test_split(
    tweet_text,
    sentiment_labels,
    train_size=0.03125,  # 1/32 of the data: about 50k rows out of 1.6 million
    stratify=sentiment_labels,
    random_state=42,  # fixed seed so the split is reproducible
)

# only the text of the training subsample is used to train the models
contents = X_train['text'].tolist()

In [3]:
# fetch the NLTK resources in the same order as before; the status returned by the
# last download is left as the cell's final expression so it is still displayed
download_status = None
for resource_name in ('stopwords', 'punkt', 'wordnet'):
    download_status = nltk.download(resource_name)
download_status

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yjiah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yjiah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yjiah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Preprocess the raw tweets into cleaned token lists, one list per tweet:
# lowercase -> strip punctuation -> whitespace tokenize -> drop stopwords -> lemmatize.

# lowercase every tweet (each tweet is kept whole; it is not split into sentences)
sentences_lst = [tweet.lower() for tweet in contents]

# remove punctuation: drop every character that is neither a word character nor whitespace
sentences_lst_removed_punc = [re.sub(r'[^\w\s]', '', sentence) for sentence in sentences_lst]

# tokenize each tweet into words by splitting on whitespace
tokenized_sentences = [sentence.split() for sentence in sentences_lst_removed_punc]

# stopword removal
stop_words = set(stopwords.words('english'))
stopword_free_sentences = [[word for word in sentence if word not in stop_words] for sentence in tokenized_sentences]

# lemmatization (lemmatize() is called with its default part of speech)
# This must run on the stopword-filtered tokens: lemmatizing `tokenized_sentences`
# instead would silently put every stopword back into the final result.
lemmatizer = WordNetLemmatizer()
filtered_tokenized_sentences = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in stopword_free_sentences]

In [5]:
# persist the preprocessed training split so other models can reuse it later
X_train['processed_text'] = filtered_tokenized_sentences

# binarise the label: 0 stays 0 (bad sentiment), any other value (4 in the raw data) becomes 1 (good sentiment)
y_train['target'] = y_train['target'].ne(0).astype('int64')

# labels first, then the text columns, placed side by side and renumbered from 0
final_dataset = pd.concat([y_train, X_train], axis=1)
final_dataset = final_dataset.reset_index(drop=True)

final_dataset.to_csv('../data/twitter/processed.csv', index=False)