In [None]:
# Import libraries
import re
import nltk
import string
import flair
import pandas as pd
from nltk.corpus import stopwords
pd.set_option('display.max_colwidth', None)


In [None]:
# Load the flair text classifier registered under the name 'en-sentiment'.
# This single model instance is the one used by the prediction loop further down.
model = flair.models.TextClassifier.load('en-sentiment')


In [None]:
# Load the dataset from disk. Column names are supplied explicitly via
# `names`, so the first line of the file is read as data, not as a header.
columns = ["sentiment", "ID", "datetime", "query", "username", "text"]
df = pd.read_csv(
    'sentiment_dataset.csv',
    names=columns,
    encoding="ISO-8859-1",
    delimiter=',',
)
df.head()


In [None]:
# Keep only the sentiment label and the text; the other columns are not used
# in this analysis.
# `.copy()` makes the subset an independent frame, so the later in-place
# column assignments (df['text'] = ...) do not trigger SettingWithCopyWarning.
df = df[['sentiment', 'text']].copy()
df.head()


In [None]:
# Function that performs cleaning using the re library.
# It lowercases the text and removes brackets, links, HTML tags, punctuation etc
def cleaning(a):
    """Normalise one piece of raw text for sentiment analysis.

    Steps, in order:
      1. convert to ``str`` and lowercase;
      2. drop ``[...]`` bracketed spans;
      3. drop links (``http://``, ``https://`` or ``www.`` prefixes);
      4. drop ``<...>`` HTML-style tags;
      5. turn ``( ) ! ?`` into spaces so neighbouring words stay separated;
      6. strip every remaining character in ``string.punctuation``;
      7. remove newlines;
      8. remove any word that contains a digit;
      9. collapse a whitespace-only result to the empty string.

    Links, tags and ``( ) ! ?`` are handled *before* the general punctuation
    strip: once ``:``, ``/``, ``.``, ``<`` and ``>`` are gone those patterns
    can no longer match, which is why links used to survive as glued tokens.

    Parameters
    ----------
    a : object
        Value to clean; it is passed through ``str()`` first.

    Returns
    -------
    str
        The cleaned text. Interior runs of spaces left behind by removals
        are not collapsed.
    """
    a = str(a).lower()
    a = re.sub(r'\[.*?\]', '', a)
    # Structure-dependent patterns must run while the punctuation is still there.
    a = re.sub(r'https?://\S+|www\.\S+', '', a)
    a = re.sub(r'<.*?>+', '', a)
    a = re.sub(r'[()!?]', ' ', a)
    a = re.sub('[%s]' % re.escape(string.punctuation), '', a)
    a = re.sub(r'\n', '', a)
    a = re.sub(r'\w*\d\w*', '', a)
    # Last, so that text reduced to blanks by the steps above also becomes ''.
    a = re.sub(r'^\s*$', '', a)
    return a


# Run every row of the text column through cleaning() and preview the result.
df['text'] = df['text'].map(cleaning)
df.head()


In [None]:
# Check class balance: count how many rows carry each sentiment label
df['sentiment'].value_counts()


In [None]:
# Removing stop words i.e., the, is, and, or, in, this etc
s_words = stopwords.words('english')
# Hashed snapshot of the list above: removing() is called once per row and
# tests every token, so a set lookup avoids rescanning the whole list each time.
_s_words_lookup = frozenset(s_words)


def removing(text):
    """Drop English stop words from a space-separated string.

    The text is split on single spaces, every token found in the stop-word
    list is discarded, and the remaining tokens are joined back with single
    spaces. Matching is exact and case-sensitive.

    NOTE(review): by the time this runs the text has already had its
    punctuation stripped by cleaning(), so any stop word spelled with an
    apostrophe can no longer match -- confirm whether that is intended.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by single spaces.

    Returns
    -------
    str
        The text without stop-word tokens.
    """
    return ' '.join(
        token for token in text.split(' ') if token not in _s_words_lookup
    )


# Strip the stop words from every row of the text column and preview the result.
df['text'] = df['text'].map(removing)
df.head()


In [None]:
# Translate the numeric labels into words: 0 -> 'negative', 4 -> 'positive'.
# The lookup is a plain dict access, so any other label value raises KeyError.
class_dict = {0: 'negative', 4: 'positive'}
df['sentiment'] = df['sentiment'].apply(class_dict.__getitem__)
df.head()


In [None]:
# Reduce each word to its stem so that inflected forms of a word are
# represented by the same token
stemmer = nltk.SnowballStemmer("english")


def stemming(text):
    """Return `text` with every space-separated token replaced by its stem.

    The text is split on single spaces, each piece is passed to the
    module-level Snowball stemmer, and the stems are joined back together
    with single spaces.
    """
    stems = []
    for token in text.split(' '):
        stems.append(stemmer.stem(token))
    return ' '.join(stems)


# Stem every row of the text column and preview the result.
df['text'] = df['text'].map(stemming)
df.head()


In [None]:
# Trim leading and trailing whitespace from every text value.
# Only the two ends are stripped; runs of spaces inside the text are left as they are.
df['text'] = df['text'].str.strip()
df.head()


## Using Flair

In [None]:
# Predict a sentiment label and a confidence score for every row of df['text'].
# Exactly one entry is appended to each list per row, so both lists stay
# aligned with df and can be assigned back as columns afterwards.
# (The earlier version appended placeholders for an empty text and then fell
# through to the prediction branch, appending a second time for the same row.)
sentiment = []
confidence = []

for text in df['text']:
    # Placeholders used when the text is empty or no label is predicted.
    label = ""
    score = ""

    if text.strip():
        sample = flair.data.Sentence(text)
        # predict() annotates `sample`; the result is read from its labels.
        model.predict(sample)

        if len(sample.labels) > 0:
            label = sample.labels[0].value
            score = sample.labels[0].score

    sentiment.append(label)
    confidence.append(score)


In [None]:
# Attach the flair predictions as new columns.
# Both lists must hold exactly one entry per row of df, otherwise pandas
# rejects the assignment with a length-mismatch error.
df['pred_sentiment'] = sentiment
df['confidence'] = confidence


In [None]:
# Preview the first 10 rows, including the predicted label and its confidence
df.head(10)


In [None]:
# Split the data into train data (75%) and test data (25%).
# `train_test_split` is never imported in this notebook, so calling it raised
# NameError; an equivalent seeded random 75/25 split is done with pandas.
# This relies on df having a unique index (true for the default RangeIndex
# that read_csv produces).
X = df['text']
y = df['sentiment']
X_train = X.sample(frac=0.75, random_state=42)
X_test = X.drop(X_train.index)
# Pick the labels by index so each text stays paired with its own sentiment.
y_train = y.loc[X_train.index]
y_test = y.loc[X_test.index]
