In [None]:
# library to let the user interact with the OS that the python is running on
import os
# importing library to help with data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
# importing library to help with data analysis and manipulation
import numpy as np
# a collection of command style functions that make changes to figures and plots
import matplotlib.pyplot as plt
# a function in inline-mode that allows for the display of generated plots
%matplotlib inline
# importing the 'seaborn' library that provides an interface for creating informative graphs
import seaborn as sns
# importing the Regular Expression syntax operations
import re
# importing the nltk library
import nltk
# NLTK's stop-word corpus: lists of very common words (e.g. "the", "is") that are removed from the tweets before modelling
from nltk.corpus import stopwords
# function to remove affixes from the word, and returns the word stem
from nltk.stem.porter import PorterStemmer
# class that lemmatizes a word, i.e. reduces it to its dictionary base form (lemma)
from nltk.stem import WordNetLemmatizer
# 'word_tokenize': function that splits a sentence into words
# 'sent_tokenize': function that tokenizes inserted text into sentences
from nltk.tokenize import word_tokenize,sent_tokenize
# function to convert text documents into matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer
# 'train_test_split': function to split matrices into train and test sets
# 'cross_val_score': function to evaluate a score by cross-validation
from sklearn.model_selection import train_test_split, cross_val_score
# importing 'LogisticRegression' that allows us to perform machine learning with linear models
from sklearn.linear_model import LogisticRegression
# importing 'classification_report' that allows for metric report to measure classification performance
from sklearn.metrics import classification_report
# importing 'GridSearchCV' that finds the optimal value from the set that it is called upon
from sklearn.model_selection import GridSearchCV

# Read the Kaggle training and test CSV files (in that order), decoding both
# as latin-1. These two frames are kept exactly as loaded.
trainOriginal, testOriginal = (
    pd.read_csv(csvPath, encoding='latin-1')
    for csvPath in (
        '../input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
        '../input/covid-19-nlp-text-classification/Corona_NLP_test.csv',
    )
)

# All later processing happens on copies, so the frames loaded above keep
# their initial structure.
train, test = trainOriginal.copy(), testOriginal.copy()

# Earlier attempts to open the files with a utf-8 encoding led to a unicode error, as it couldn't
# parse certain parts of the file; hence, the latin-1 encoding was introduced as a solution.

# preview the first five rows of the train set
train.head(n=5)
# preview the first five rows of the test set
test.head(n=5)

# The datasets contain 7 columns of data. The UserName and ScreenName columns have been
# encrypted due to privacy concerns. The tweets contain mentions and hashtags, which must be cleaned
# in order to help the models better understand the statistical relationship between the relevant
# details. The sentiment column contains 5 different classes, which can be remapped into 3 for better
# statistical understanding. The other columns are the timeframe of the tweets and the location from
# where the tweets were sent.

# print a summary of the train set (columns, non-null counts, data types, memory usage)
train.info()

# count the missing values in every column of the train set
train.isna().sum()

# The location column contains a whopping 8590 missing rows. Filling the blanks with the most
# common location would not make sense, as too many details are missing.
# the 60 most frequent raw location strings in the train set
train['Location'].value_counts().head(60)

# keep only the part of each location that precedes the first comma,
# in the train set and in the test set
for frame in (train, test):
    frame['Location'] = frame['Location'].str.split(",").str.get(0)

# the 60 most frequent locations in the train set after the trimming above
train['Location'].value_counts().head(60)
# how often each distinct value of the TweetAt column occurs in the train set
train['TweetAt'].value_counts()

# The data collected was tweeted between 16th March 2020 and 14th April 2020. Any model built and
# deployed at this time will likely not be relevant for present use due to new findings, research and
# trends that have emerged and that influence every recent COVID-19 related tweet. Any model built
# using this data will be a decayed model, and further decay will happen at a rapid pace.
# how often each sentiment class occurs in the train set
train['Sentiment'].value_counts()

# horizontal bar chart of the 19 most frequent tweet locations on a 10x10 figure
plt.figure(figsize=(10,10))
topLocations = train.Location.value_counts().iloc[0:19].index
sns.countplot(y='Location', data=train, order=topLocations).set_title("Twitted locations")

# Apply the white-grid style and the 11x8 default figure size in ONE call.
# A bare sns.set(rc=...) resets the style to seaborn's default, so calling it
# after sns.set_style("whitegrid") silently discarded the white grid.
sns.set(style="whitegrid", rc={'figure.figsize':(11,8)})
# start a new figure so the sentiment chart is not drawn on top of the
# location chart when the statements run as one script
plt.figure()
# Bar chart of the number of tweets per sentiment class. The column is passed
# by keyword because recent seaborn releases treat the first positional
# argument of countplot as `data`, not as `x`.
sns.countplot(x=train['Sentiment'])

# number of tweets per sentiment class in the train set, most frequent first
sentimentCounts = train.Sentiment.value_counts()
# Take the wedge labels from the counts themselves so every label is
# guaranteed to sit on its own wedge. A hand-typed list mislabels wedges as
# soon as the frequency order differs from the typed order, and the previous
# list also misspelled "Positive".
labels = sentimentCounts.index.tolist()
# setting the colors of the wedges in the pie chart
colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99', '#ff5645']
# pull every wedge slightly away from the centre; sized from the data so the
# call keeps working if the number of classes changes
explode = (0.05,) * len(sentimentCounts)
# start a new figure so the pie is not drawn over an earlier chart
plt.figure()
# plotting the pie chart, with the percentage printed inside each wedge
plt.pie(sentimentCounts, colors=colors, labels=labels,
        autopct='%1.1f%%', startangle=90, pctdistance=0.85, explode=explode)
# white disc over the centre of the pie, turning it into a donut chart
centreCircle = plt.Circle((0,0),0.70,fc='white')
# getting the current figure and adding the disc to its current axes
fig = plt.gcf()
fig.gca().add_artist(centreCircle)
# adjusting the padding around the axes, then showing the figure(s)
plt.tight_layout()
plt.show()

# select columns 2 and 5 of the train set by position
# (these are the location and sentiment columns)
plotDf = train.iloc[:, [2, 5]]
# displaying the selection
plotDf

# default figure size for the next chart: 15 x 9
sns.set(rc={'figure.figsize': (15, 9)})
# the five most frequent locations in the train set, used as the bar order
gg = train['Location'].value_counts().head(5).index
# title of the chart, in a bold 16-point font
plt.title(label='Sentiment Categories of the First 5 Top Locations', fontsize=16, fontweight='bold')
# one group of bars per location, one bar per sentiment class within a group
sns.countplot(data=plotDf, x='Location', hue='Sentiment', order=gg)

# mark the origin of every row: 0 for the train set, 1 for the test set
train['Identity'], test['Identity'] = 0, 1
# stack the train rows on top of the test rows and renumber the index from 0
covid = pd.concat([train, test], ignore_index=True)
# first five rows of the combined set
covid.head()
# fold the two "Extremely ..." classes into their plain counterparts
for strongLabel, plainLabel in (('Extremely Positive', 'Positive'),
                                ('Extremely Negative', 'Negative')):
    covid['Sentiment'] = covid['Sentiment'].str.replace(strongLabel, plainLabel)

# remove the ScreenName and UserName columns
covid = covid.drop(columns=['ScreenName', 'UserName'])
# displaying the resulting set
covid

# Apply the white-grid style and the 11x8 default figure size in ONE call.
# A bare sns.set(rc=...) resets the style to seaborn's default, so calling it
# after sns.set_style("whitegrid") silently discarded the white grid.
sns.set(style="whitegrid", rc={'figure.figsize':(11,8)})
# start a new figure so this chart is not drawn over an earlier one
plt.figure()
# Bar chart of the number of tweets per merged sentiment class. The column is
# passed by keyword because recent seaborn releases treat the first positional
# argument of countplot as `data`, not as `x`.
sns.countplot(x=covid['Sentiment'])
# number of tweets per merged sentiment class, most frequent first
mergedCounts = covid.Sentiment.value_counts()
# Take the wedge labels from the counts themselves so every label is
# guaranteed to sit on its own wedge; the previous hand-typed list depended on
# the frequency order and misspelled "Positive".
labels = mergedCounts.index.tolist()
# setting the colors of the wedges
colors = ['lightblue','lightsteelblue','silver']
# pull every wedge slightly away from the centre, one entry per class
explode = (0.1,) * len(mergedCounts)
# the pie gets its own figure so it does not overlay the bar chart above
plt.figure()
# displaying the pie chart; `shadow` is an on/off switch, so pass a boolean
# rather than the arbitrary truthy number 300
plt.pie(mergedCounts, colors=colors, labels=labels,
        shadow=True, autopct='%1.1f%%', startangle=90, explode=explode)
plt.show()

# 10 x 10 figure for the location chart
plt.figure(figsize=(10, 10))
# horizontal bar chart of the 19 most frequent locations in the train set
locationOrder = train.Location.value_counts().head(19).index
locationAxes = sns.countplot(y='Location', data=train, order=locationOrder)
locationAxes.set_title("Twitted locations")
# replace the sentiment names in the combined set by integer codes
sentimentCodes = {'Neutral': 0, 'Positive': 1, 'Negative': 2}
covid['Sentiment'] = covid['Sentiment'].map(sentimentCodes)
# collect every "#" followed by non-whitespace characters from the tweets of
# the combined set and count how often each one occurs
hashTags = covid['OriginalTweet'].str.extractall(r"(#\S+)")[0].value_counts()
# the 50 most frequent hashtags
hashTags.head(50)
# the same for "@" mentions
# NOTE(review): mentions are taken from `train` only, while hashtags use the
# combined `covid` set -- confirm whether that difference is intended
mentions = train['OriginalTweet'].str.extractall(r"(@\S+)")[0].value_counts()
# the 50 most frequent mentions
mentions.head(50)

# function that removes urls, mentions, hashtags, digits, <...> tags and stopwords from a tweet
def clean(text):
    """Return *text* with noise removed and the remaining words re-joined.

    Every URL (``http`` followed by non-whitespace), ``@mention``,
    ``#hashtag``, run of digits and ``<...>`` tag is replaced by a space.
    The result is split on whitespace, each word found in the module-level
    ``stopWord`` collection is dropped (the comparison is case-sensitive),
    and the surviving words are joined with single spaces.

    ``stopWord`` is looked up when the function is called, so it only has to
    exist before the first call, not before this definition.
    """
    # The tag pattern must be a raw string r'<.*?>'. It used to be written
    # 'r<.*?>', which put the "r" INSIDE the pattern: tags were only matched
    # when preceded by a literal "r", and that letter was deleted with them.
    noisePatterns = (r'http\S+', r'@\w+', r'#\w+', r'\d+', r'<.*?>')
    for pattern in noisePatterns:
        text = re.sub(pattern, ' ', text)

    return " ".join(word for word in text.split() if word not in stopWord)

# NLTK's list of English stop words; clean() reads this name when it is called
stopWord = stopwords.words('english')
# clean every tweet of the combined set (clean is passed directly; wrapping it
# in a lambda added nothing)
covid['OriginalTweet'] = covid['OriginalTweet'].apply(clean)
covid.head()
# Keep only the three columns used from here on. The explicit .copy() makes
# `covid` an independent frame, so adding a column to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
covid = covid[['OriginalTweet','Sentiment','Identity']].copy()
covid.head()

# lemmatizer applied to every token below
lemma = nltk.WordNetLemmatizer()
# For each cleaned tweet: split it into tokens, lemmatize every token and join
# the results with single spaces; the resulting strings form the Corpus column.
covid['Corpus'] = [
    " ".join(lemma.lemmatize(token) for token in nltk.word_tokenize(tweet))
    for tweet in covid.OriginalTweet
]
# first five rows of the dataset, now including the Corpus column
covid.head()

# Split the combined set back into its two parts using the Identity marker
# (0 = train, 1 = test) and remove the marker column. drop() returns a new
# frame here instead of modifying the filtered slice in place, which avoids
# pandas' SettingWithCopyWarning.
train = covid[covid.Identity == 0].drop(columns='Identity')
# the test rows additionally get their index renumbered from 0
test = covid[covid.Identity == 1].drop(columns='Identity').reset_index(drop=True)
# first five rows of the train set
train.head()
# first five rows of the test set
test.head()

# features (the Corpus text) and labels (the Sentiment codes) of the train set
XTrain, yTrain = train['Corpus'], train['Sentiment']
# features and labels of the test set
XTest, yTest = test['Corpus'], test['Sentiment']

# hold back 20% of the train set as a validation set (fixed seed for repeatability)
XTrain, XVal, yTrain, yVal = train_test_split(
    XTrain, yTrain, test_size=0.2, random_state=42)
# sizes of all six parts
tuple(part.shape for part in (XTrain, XVal, yTrain, yVal, XTest, yTest))

# Turn the texts into a matrix of unigram and bigram counts. The vocabulary is
# learned from the training split ONLY: fitting it on the whole corpus (as was
# done before) lets the validation and test tweets influence which terms pass
# the min_df threshold, i.e. leaks held-out data into the features.
vectorizer = CountVectorizer(stop_words='english',ngram_range=(1,2),min_df=5).fit(XTrain)
XTrainVec = vectorizer.transform(XTrain)
XValVec = vectorizer.transform(XVal)
XTestVec = vectorizer.transform(XTest)
# logistic regression classifier with a fixed seed
logReg = LogisticRegression(random_state=42)
# Mean score over 10 cross-validation folds of the training split.
# cross_val_score clones the estimator, so `logReg` itself stays unfitted here.
# The result is stored and printed; previously it was computed and discarded.
cvAccuracy = cross_val_score(logReg, XTrainVec, yTrain,
                             cv=10, verbose=1, n_jobs=-1).mean()
print('Mean 10-fold CV accuracy:', cvAccuracy)
# fit on the full training split and report per-class metrics on the validation split
model = logReg.fit(XTrainVec, yTrain)
print(classification_report(yVal, model.predict(XValVec)))

# candidate settings: the l2 penalty, and 10 values of C spaced evenly on a
# log scale from 10**0 to 10**4
penalty = ['l2']
C = np.logspace(start=0, stop=4, num=10)
hyperparameters = {'C': C, 'penalty': penalty}
# try every candidate with 5-fold cross-validation on the training split
logRegGrid = GridSearchCV(estimator=logReg, param_grid=hyperparameters, cv=5, verbose=0)
bestModel = logRegGrid.fit(XTrainVec, yTrain)

# displaying the hyperparameters of the best estimator found
bestParams = bestModel.best_estimator_.get_params()
print('Best Penalty:', bestParams['penalty'])
print('Best C:', bestParams['C'])

# Final Logistic Regression model performance:
# predict the test set once with the tuned model ...
yPred = bestModel.predict(XTestVec)
# ... and report per-class metrics from those predictions (the prediction was
# previously computed a second time while yPred went unused)
print(classification_report(yTest, yPred))

# INPUT PARAMETERS:
# Throughout this assignment, from reading the provided file, I believe that the input parameters
# are meant to be combined with the data and information taken from the test and the train sets.
# A classifier would then utilize the data to see how the input variables relate to the dataset.

# CountVectorizer() function:
# the 'CountVectorizer' function in this assignment is meant to transform a collection of texts
# into a numerical matrix of word counts. I believe that in this assignment specifically this is needed because
# the logistic regression classifier is trained on numerical features and cannot work on raw text directly.

# I was very unfamiliar with a lot of libraries and functions used in this assignment, so I used the following 
# resources in order to better understand certain parts:
# https://www.geeksforgeeks.org/how-to-use-matplotlib-plot-inline/
# https://pypi.org/project/matplotlib-inline/
# https://matplotlib.org/2.0.2/users/pyplot_tutorial.html
# https://www.nltk.org/api/nltk.tokenize.html
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
# https://seaborn.pydata.org/tutorial/introduction.html