In [1]:
# Import libraries
import re
import nltk
import string
import flair
import pandas as pd
from nltk.corpus import stopwords
pd.set_option('display.max_colwidth', None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Flair TextClassifier identified by 'en-sentiment'; `model` is used
# further down to predict a sentiment label for each text.
model = flair.models.TextClassifier.load('en-sentiment')

In [3]:
# Load the dataset from CSV; the column names are supplied explicitly via `names`
columns = ["sentiment", "ID", "datetime", "query", "username", "text"]
df = pd.read_csv(
    'sentiment_dataset.csv',
    delimiter=',',
    names=columns,
    encoding="ISO-8859-1",
)
df.head()

In [None]:
# Keep only the sentiment and text columns; these are the ones relevant to this analysis
df = df.loc[:, ['sentiment', 'text']]
df.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


In [None]:
# Function that performs text cleaning using the re library.
# It lowercases the text and removes bracketed spans, links, <...> tags,
# punctuation, newlines and tokens that contain digits.
def cleaning(a):
    """Normalise a raw text value.

    Steps, in order: convert to a lowercase string; drop ``[...]`` spans;
    drop URLs and ``<...>`` tags; drop ASCII punctuation; drop newlines;
    drop every token that contains a digit; turn a whitespace-only result
    into the empty string.

    Args:
        a: Any value; it is passed through ``str()`` before cleaning.

    Returns:
        The cleaned, lowercase string.
    """
    a = str(a).lower()
    a = re.sub(r'\[.*?\]', '', a)
    # URLs and tags have to be removed while their punctuation (':', '/', '.',
    # '<', '>') is still present; once punctuation has been stripped these
    # patterns can no longer match anything.
    a = re.sub(r'https?://\S+|www\.\S+', '', a)
    a = re.sub(r'<.*?>+', '', a)
    # string.punctuation also covers '(', ')', '!' and '?', so no separate
    # step is needed for those characters.
    a = re.sub('[%s]' % re.escape(string.punctuation), '', a)
    a = re.sub(r'\n', '', a)
    a = re.sub(r'\w*\d\w*', '', a)
    # Runs last so that text left blank by the removals above becomes ''.
    a = re.sub(r'^\s*$', '', a)
    return a

# Run cleaning() on every entry of the text column and preview the result
df['text'] = df['text'].map(cleaning)
df.head()

Unnamed: 0,sentiment,text
0,0,switchfoot awww thats a bummer you shoulda got david carr of third day to do it d
1,0,is upset that he cant update his facebook by texting it and might cry as a result school today also blah
2,0,kenichan i dived many times for the ball managed to save the rest go out of bounds
3,0,my whole body feels itchy and like its on fire
4,0,nationwideclass no its not behaving at all im mad why am i here because i cant see you all over there


In [None]:
# Checking for balance: count how many rows carry each sentiment value
df['sentiment'].value_counts()

0    800000
4    800000
Name: sentiment, dtype: int64

In [None]:
# Removing stop words, e.g. the, is, and, or, in, this
s_words = stopwords.words('english')
# Set copy of the stop-word list: membership tests in removing() are then
# hash lookups instead of a scan of the whole list for every token.
_s_words_lookup = frozenset(s_words)


def removing(text):
    """Return `text` without the tokens found in the English stop-word list.

    The text is split on single spaces and the surviving tokens are joined
    back with single spaces. Tokens are compared exactly as they appear in
    `text`, so the caller is responsible for lowercasing beforehand.

    Args:
        text: The string to filter.

    Returns:
        The filtered string.
    """
    return ' '.join(token for token in text.split(' ') if token not in _s_words_lookup)
    
# Run removing() on every entry of the text column and preview the result
df['text'] = df['text'].map(removing)
df.head()

Unnamed: 0,sentiment,text
0,0,switchfoot awww thats bummer shoulda got david carr third day
1,0,upset cant update facebook texting might cry result school today also blah
2,0,kenichan dived many times ball managed save rest go bounds
3,0,whole body feels itchy like fire
4,0,nationwideclass behaving im mad cant see


In [None]:
# Translate the numeric labels into names: 0 -> negative, 4 -> positive
class_dict = {0: 'negative', 4: 'positive'}
df['sentiment'] = df['sentiment'].apply(class_dict.__getitem__)
df.head()

Unnamed: 0,sentiment,text
0,negative,switchfoot awww thats bummer shoulda got david carr third day
1,negative,upset cant update facebook texting might cry result school today also blah
2,negative,kenichan dived many times ball managed save rest go bounds
3,negative,whole body feels itchy like fire
4,negative,nationwideclass behaving im mad cant see


In [None]:
# Stemming: replace every word with the stem produced by the English Snowball stemmer
stemmer = nltk.SnowballStemmer("english")


def stemming(text):
    """Return `text` with each space-separated token replaced by its stem."""
    stems = [stemmer.stem(token) for token in text.split(' ')]
    return ' '.join(stems)

# Run stemming() on every entry of the text column and preview the result
df['text'] = df['text'].map(stemming)
df.head()

Unnamed: 0,sentiment,text
0,negative,switchfoot awww that bummer shoulda got david carr third day
1,negative,upset cant updat facebook text might cri result school today also blah
2,negative,kenichan dive mani time ball manag save rest go bound
3,negative,whole bodi feel itchi like fire
4,negative,nationwideclass behav im mad cant see


In [None]:
# Stripping off extra spaces: removes leading and trailing whitespace from each text
# (whitespace between words is left as it is)
df['text'] = df['text'].str.strip()
df.head()

Unnamed: 0,sentiment,text
0,negative,switchfoot awww that bummer shoulda got david carr third day
1,negative,upset cant updat facebook text might cri result school today also blah
2,negative,kenichan dive mani time ball manag save rest go bound
3,negative,whole bodi feel itchi like fire
4,negative,nationwideclass behav im mad cant see


## Using Flair

In [None]:
# Predict a sentiment label and a confidence score for every entry of df['text'].
# Both lists must end up with exactly one entry per row, because they are
# assigned back to the DataFrame as columns afterwards.
sentiment = []
confidence = []

for text in df['text']:
    # Blank text gets placeholder values and is not sent to the model.
    if text.strip() == "":
        sentiment.append("")
        confidence.append("")
        # Skip the prediction below; falling through would append a second
        # entry for this row and make the lists longer than the DataFrame.
        continue

    sample = flair.data.Sentence(text)
    model.predict(sample)

    if len(sample.labels) > 0:
        # Use the first label attached to the sentence by the prediction
        sentiment.append(sample.labels[0].value)
        confidence.append(sample.labels[0].score)
    else:
        # handle case where no label was predicted
        sentiment.append("")
        confidence.append("")




In [None]:
# Attach the predictions as new columns; each list needs one entry per DataFrame row
df['pred_sentiment'] = sentiment
df['confidence'] = confidence

In [None]:
# Preview the first ten rows of the DataFrame
df.head(10)

NameError: name 'df' is not defined

In [None]:
# # Split the data into training and test sets
# X = df['text']
# y = df['sentiment']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)