<a href="https://colab.research.google.com/github/v1umahmo/HonoursProject/blob/main/HonoursCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Necessary Libraries

In [None]:
#For data processing
import pandas as pd
#For linear algebra
import numpy as np
#Regular expression library
import re
import nltk

### Import the Training and Testing Data

In [None]:
# NOTE: both paths below are empty-string placeholders; fill in the real CSV locations before running.
train = pd.read_csv("") # Import training data from file path
test = pd.read_csv("") # Import test data from file path

### Look At The Training Data

In [None]:
train.info() # Print a summary of the training DataFrame (columns, non-null counts, dtypes)

In [None]:
train.head(10) # Show the first ten rows (tweets) of the training data

In [None]:
train.tail(10) # Show the last ten rows (tweets) of the training data

### Clean The Training Data

In [None]:
# Helper that replaces every match of a regex pattern in a single tweet with a space
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of ``pattern`` replaced by a space.

    Args:
        input_txt: The text to clean (a single tweet).
        pattern: Regular expression describing the text to strip out.

    Returns:
        A copy of ``input_txt`` in which each non-overlapping match of
        ``pattern`` has been replaced by a single space.
    """
    # Substitute with the pattern itself instead of re-using each matched
    # string as a new regex: matched text may contain metacharacters such as
    # '(' or '+', and a short match (e.g. '@ab') would otherwise also clip the
    # front of a longer one ('@abc'), leaving stray characters behind.
    return re.sub(pattern, ' ', input_txt)

In [None]:
# Get rid of the user handles (an '@' followed by any run of word characters).
# The NumPy helper is spelled `vectorize`; the pattern is a raw string so that
# `\w` reaches the regex engine without an invalid-escape warning.
train['cleaned_tweets'] = np.vectorize(remove_pattern)(train['tweet'], r"@[\w]*")

In [None]:
train.head(10) # Show the first ten rows; 'cleaned_tweets' should no longer contain twitter handles

In [None]:
# Convert every cleaned tweet to lower case
train['cleaned_tweets'] = train['cleaned_tweets'].map(str.lower)

In [None]:
train.head(10) # Show the first ten rows; 'cleaned_tweets' should now be lower case

In [None]:
# Swap hashtags, punctuation and other special characters for spaces
# (i.e. anything that is neither a word character nor whitespace)
train['cleaned_tweets'] = train['cleaned_tweets'].map(
    lambda tweet: re.sub(r'[^\w\s]', ' ', tweet)
)

In [None]:
train.head(10) # Show the first ten rows after special characters were replaced by spaces

In [None]:
# Remove numbers and every other character that is not an ASCII letter.
# The upper-case range must end at 'Z': 'A-z' also spans '[', '\', ']', '^',
# '_' and '`', which let underscores and friends survive the clean-up.
train['cleaned_tweets'] = train['cleaned_tweets'].apply(lambda x: re.sub(r'[^a-zA-Z]', ' ', x))

In [None]:
train.head(10) # Show the first ten rows after non-letter characters were replaced by spaces

In [None]:
# Import libraries for tokenization and stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Split each cleaned tweet into a list of word tokens
train['tokenized_tweets'] = train['cleaned_tweets'].apply(word_tokenize)

In [None]:
train.head(10) # Show the first ten rows including the new 'tokenized_tweets' column

In [None]:
# Build the set of English stop words (common words such as "if", "but", "can")
# so they can be filtered out of the tokenized tweets; a set gives fast membership tests
stop_words = set(stopwords.words('english'))
stop_words # Bare expression: shows the set when it is the last line of a notebook cell

In [None]:
# Keep only the tokens of each tweet that are not in the stop-word set
train['tokenized_tweets_filtered'] = train['tokenized_tweets'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
train.head(10) # Show the first ten rows including the new 'tokenized_tweets_filtered' column

In [None]:
# Import the Porter stemmer and create the instance used by the stemming cell below
from nltk.stem import PorterStemmer
stemming = PorterStemmer()

In [None]:
# Stem the tweets and create another column for tweets that are stemmed.
# PorterStemmer exposes stem(), not step(); each token is stemmed and the
# results are joined back into one space-separated string per tweet.
train['stemmed_tweets'] = train['tokenized_tweets_filtered'].apply(lambda x: ' '.join(stemming.stem(i) for i in x))

In [None]:
train.head(10) # Show the first ten rows including the new 'stemmed_tweets' column

In [None]:
# Import library for lemmatizing. The class is spelled WordNetLemmatizer
# (capital "N"); the previous spelling raised ImportError.
# NOTE: lemmatize() needs the WordNet corpus to be installed, e.g. via nltk.download('wordnet').
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizing = WordNetLemmatizer()

In [None]:
# Lemmatize every remaining token and join them back into one
# space-separated string per tweet, stored in a new column
train['lemmatized_tweets'] = train['tokenized_tweets_filtered'].apply(
    lambda tokens: ' '.join(lemmatizing.lemmatize(token) for token in tokens)
)

In [None]:
train.head(10) # Show the first ten rows including the new 'lemmatized_tweets' column

### Apply Feature Extraction

In [None]:
# Import CountVectorizer for feature extraction from the tweet text
from sklearn.feature_extraction.text import CountVectorizer

### Clean The Test Data

In [None]:
test.info() # Print a summary of the test DataFrame (columns, non-null counts, dtypes)

In [None]:
test.head(10) # Show the first ten rows of the test data

In [None]:
test.tail(10) # Show the last ten rows of the test data

In [None]:
# Get rid of the user handles (an '@' followed by any run of word characters).
# The NumPy helper is spelled `vectorize`; the pattern is a raw string so that
# `\w` reaches the regex engine without an invalid-escape warning.
test['cleaned_tweets'] = np.vectorize(remove_pattern)(test['tweet'], r"@[\w]*")

In [None]:
test.head(10) # Show the first ten rows; 'cleaned_tweets' should no longer contain twitter handles

In [None]:
# Convert every cleaned tweet to lower case
test['cleaned_tweets'] = test['cleaned_tweets'].map(str.lower)

In [None]:
test.head(10) # Show the first ten rows; 'cleaned_tweets' should now be lower case

In [None]:
# Swap hashtags, punctuation and other special characters for spaces
# (i.e. anything that is neither a word character nor whitespace)
test['cleaned_tweets'] = test['cleaned_tweets'].map(
    lambda tweet: re.sub(r'[^\w\s]', ' ', tweet)
)

In [None]:
test.head(10) # Show the first ten rows after special characters were replaced by spaces

In [None]:
# Remove numbers and every other character that is not an ASCII letter.
# The upper-case range must end at 'Z': 'A-z' also spans '[', '\', ']', '^',
# '_' and '`', which let underscores and friends survive the clean-up.
test['cleaned_tweets'] = test['cleaned_tweets'].apply(lambda x: re.sub(r'[^a-zA-Z]', ' ', x))

In [None]:
test.head(10) # Show the first ten rows after non-letter characters were replaced by spaces

In [None]:
# Import libraries for tokenization and stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Split each cleaned tweet into a list of word tokens
test['tokenized_tweets'] = test['cleaned_tweets'].apply(word_tokenize)

In [None]:
test.head(10) # Show the first ten rows including the new 'tokenized_tweets' column

In [None]:
# Build the set of English stop words used to filter the tokenized test tweets
# (same construction as in the training-data section)
stop_words = set(stopwords.words('english'))
stop_words # Bare expression: shows the set when it is the last line of a notebook cell

In [None]:
# Keep only the tokens of each tweet that are not in the stop-word set
test['tokenized_tweets_filtered'] = test['tokenized_tweets'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
test.head(10) # Show the first ten rows including the new 'tokenized_tweets_filtered' column

In [None]:
# Import the Porter stemmer and create the instance used by the stemming cell below
# (repeats the import and instantiation from the training-data section)
from nltk.stem import PorterStemmer
stemming = PorterStemmer()

In [None]:
# Stem the tweets and create another column for tweets that are stemmed.
# PorterStemmer exposes stem(), not step(); each token is stemmed and the
# results are joined back into one space-separated string per tweet.
test['stemmed_tweets'] = test['tokenized_tweets_filtered'].apply(lambda x: ' '.join(stemming.stem(i) for i in x))

In [None]:
test.head(10) # Show the first ten rows including the new 'stemmed_tweets' column

In [None]:
# Import library for lemmatizing. The class is spelled WordNetLemmatizer
# (capital "N"); the previous spelling raised ImportError.
# NOTE: lemmatize() needs the WordNet corpus to be installed, e.g. via nltk.download('wordnet').
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizing = WordNetLemmatizer()

In [None]:
# Lemmatize every remaining token and join them back into one
# space-separated string per tweet, stored in a new column
test['lemmatized_tweets'] = test['tokenized_tweets_filtered'].apply(
    lambda tokens: ' '.join(lemmatizing.lemmatize(token) for token in tokens)
)

In [None]:
test.head(10) # Show the first ten rows including the new 'lemmatized_tweets' column