In [2]:
import numpy as np
import pandas as pd

# This is for making some large tweets to be displayed
pd.options.display.max_colwidth = 100
train_data = pd.read_csv("Tweets/train.csv", encoding='ISO-8859-1')

 Visualize the tweets

In [3]:
# We will now take a look at random tweets
# to gain more insights

rand_indexs = np.random.randint(1,len(train_data),20).tolist()
train_data["SentimentText"][rand_indexs]

25204    @adambarber YES! OF COURSE! You guys should have a show in south eastern Pennsylvania one of the...
67270                                            @BobChoat If the guts don't feel good...nothing feels good 
65422                             @BlackConfetti Nope, but you've got me playing For What It's Worth again! 
93903                             @cootiepoop  For sure post pics!  Jealous they aren't stopping in Dallas. 
67244    @brockter Better than other replies you could get, yes?   Hopefully, she's having a spectacular ...
75069                    @carleefaggot spam the shit outa #RampageCrew &lt;&lt;&lt;&lt; that for us yes ??? 
50029                                         @Artmaker let me know when you're done.. I'll help promote it 
45824                                                         @anz_rocks19 lol Did the yoghurt help at all? 
70023    @Andrew_Dew1992 you know im sure theres more people out there who think petras challenged, the l...
14573              

## Data Preporocessing

Emoticons

In [4]:
# We are gonna find what emoticons are used in our dataset
# Importing Python's regular expression module
import re  
tweets_text = train_data.SentimentText.str.cat()        # Concatinate text from coloumn into a single string
emos = set(re.findall(r" ([xX:;][-']?.) ",tweets_text)) # Extracting Emoticons, Use of Raw string 
emos_count = []
for emo in emos:                                        # Counting Emoticon Occurrences
    emos_count.append((tweets_text.count(emo), emo))
sorted(emos_count,reverse=True)

[(3281, ':/'),
 (2874, 'x '),
 (2626, ': '),
 (1339, 'x@'),
 (1214, 'xx'),
 (1162, 'xa'),
 (984, ';3'),
 (887, 'xp'),
 (842, 'xo'),
 (713, ';)'),
 (483, 'xe'),
 (431, ';I'),
 (353, ';.'),
 (254, 'xD'),
 (251, 'x.'),
 (245, '::'),
 (234, 'X '),
 (217, ';t'),
 (209, ';s'),
 (185, ':O'),
 (176, ':3'),
 (166, ';D'),
 (159, ":'"),
 (157, 'XD'),
 (146, 'x3'),
 (142, ':p'),
 (126, ":'("),
 (118, ':@'),
 (117, 'xh'),
 (117, ':S'),
 (109, 'xm'),
 (104, ';p'),
 (104, ';-)'),
 (92, ':|'),
 (91, 'x,'),
 (89, ';P'),
 (76, 'xd'),
 (75, ';o'),
 (75, ';d'),
 (71, ':o'),
 (65, 'XX'),
 (63, ':L'),
 (59, 'Xx'),
 (59, ':1'),
 (58, ':]'),
 (57, ':s'),
 (56, ':0'),
 (54, 'XO'),
 (44, ';;'),
 (43, ';('),
 (38, ':-D'),
 (37, 'xk'),
 (36, 'XT'),
 (35, 'x?'),
 (35, 'x)'),
 (34, 'x2'),
 (33, ';/'),
 (32, 'x:'),
 (32, ':\\'),
 (31, 'x-'),
 (27, 'Xo'),
 (27, 'XP'),
 (27, ':-/'),
 (26, ':-P'),
 (25, ':*'),
 (23, 'xX'),
 (22, ":')"),
 (17, 'xP'),
 (16, ':['),
 (16, ':-p'),
 (14, 'x]'),
 (14, 'XM'),
 (13, ':-O'),
 (1

In [5]:
# Classifying Emoticons
HAPPY_EMO = r" ([xX;:]-?[dD)]|:-?[\)]|[;:][pP]) "
SAD_EMO = r" (:'?[/|\(]) "
print("Happy emoticons:", set(re.findall(HAPPY_EMO, tweets_text)))
print("Sad emoticons:", set(re.findall(SAD_EMO, tweets_text)))

Happy emoticons: {';-D', 'x)', 'xD', ':p', ':d', ';d', ';p', ':D', 'xd', ':-D', ';P', ';-)', ';D', ';)', 'XD'}
Sad emoticons: {":'(", ':/', ':(', ':|'}


Most Used Words

In [6]:
import nltk
from nltk.tokenize import word_tokenize
# A function for taking a text as input and output words sorted according to their frequency, starting with the most used word.
def most_used_words(text):
    tokens = word_tokenize(text)
    frequency_dist = nltk.FreqDist(tokens)
    print("There is %d different words" % len(set(tokens)))
    return sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True)

In [7]:
# nltk.download('punkt')
# nltk.download("stopwords")
# nltk.download('wordnet')
# nltk.download('punkt')

In [9]:
most_used_words(train_data.SentimentText.str.cat())[:10]

There is 128930 different words


['@', '!', '.', 'I', ',', 'to', 'the', 'you', '?', 'a']

## Stop Words

In [10]:
from nltk.corpus import stopwords

mw = most_used_words(train_data.SentimentText.str.cat())
most_words = []
for w in mw:
    if len(most_words) == 1000:
        break
    if w in stopwords.words("english"):
        continue
    else:
        most_words.append(w)

There is 128930 different words


#### Stemming

In [11]:
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
import nltk

# Stemming involves cutting off the prefixes or suffixes of words to obtain the root or base form, known as the stem. The stem may not be a valid word itself
def stem_tokenize(text):
    stemmer = SnowballStemmer("english")
    return [stemmer.stem(token) for token in word_tokenize(text)]

# Lemmatization involves reducing words to their base or dictionary form, known as the lemma. The lemma is a valid word that represents the base meaning of the original word.
def lemmatize_tokenize(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)] 


In [12]:
stem_tokenize('I am Zain, I am doing a projet on NLP')

['i', 'am', 'zain', ',', 'i', 'am', 'do', 'a', 'projet', 'on', 'nlp']

In [13]:
lemmatize_tokenize(' I am Zain, I am doing a projet on NLP')

['I', 'am', 'Zain', ',', 'I', 'am', 'doing', 'a', 'projet', 'on', 'NLP']

## Bag of words
Bag of Words algorithm, which basically takes a text as input, extract words from it (this is our vocabulary) to use them in the vectorization process. When a tweet comes in, it will vectorize it by counting the number of occurrences of each word in our vocabulary.

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [15]:
# Building Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline

In [16]:
# preprocessing of the tweets.
# deleting useless strings (like @, # ...)
# because they will not help
# in determining if the person is Happy/Sad

class TextPreProc(BaseEstimator,TransformerMixin):
    def __init__(self, use_mention=False):
        self.use_mention = use_mention
    
    def fit(self, X, y=None):
        return self
    
    # Transformer
    def transform(self, X, y=None):
        # We can choose between keeping the mentions or deleting them. if use_mention is true keep it otherewisie delete it 
        if self.use_mention:
            X = X.str.replace(r"@[a-zA-Z0-9_]* ", " @tags ")
        else:
            X = X.str.replace(r"@[a-zA-Z0-9_]* ", "")
            
        # Keeping only the word after the #
        X = X.str.replace("#", "")
        X = X.str.replace(r"[-\.\n]", "")
        # Removing HTML garbage
        X = X.str.replace(r"&\w+;", "")
        # Removing links
        X = X.str.replace(r"https?://\S*", "")
        # replace repeated letters with only two occurences
        # heeeelllloooo => heelloo
        X = X.str.replace(r"(.)\1+", r"\1\1")
        # mark emoticons as happy or sad
        X = X.str.replace(HAPPY_EMO, " happyemoticons ")
        X = X.str.replace(SAD_EMO, " sademoticons ")
        X = X.str.lower()
        return X

In [17]:
# This is the pipeline that will transform our tweets to something eatable.
# For stop words, we let the inverse document frequency do the job

from sklearn.model_selection import train_test_split

sentiments = train_data['Sentiment']
tweets = train_data['SentimentText']

# ngram range : feature matrix will include individual words (unigrams) as well as pairs of consecutive words (bigrams) from the input text.
# Use of a term frequency-inverse document frequency (TF-IDF) vectorizer. It converts a collection of raw documents (tweets in this case) into a matrix of TF-IDF features.

vectorizer = TfidfVectorizer(tokenizer=lemmatize_tokenize, ngram_range=(1,2))  

# sequentially applying a list of transformations. Two steps: 1. Text preoporcessing 2.vectorization
pipeline = Pipeline([
    ('text_pre_processing', TextPreProc(use_mention=True)),
    ('vectorizer', vectorizer),
])
# Splitting Data
learn_data, test_data, sentiments_learning, sentiments_test = train_test_split(tweets, sentiments, test_size=0.3)

# This will tranform our learning data from simple text to vector by going through the preprocessing tranformer.
learning_data = pipeline.fit_transform(learn_data)



## Select Model

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

lr = LogisticRegression()
bnb = BernoulliNB()
mnb = MultinomialNB()

models = {
    'logitic regression': lr,
    'bernoulliNB': bnb,
    'multinomialNB': mnb,
}

for model in models.keys():
    scores = cross_val_score(models[model], learning_data, sentiments_learning, scoring="f1", cv=10)
    print("===", model, "===")
    print("scores = ", scores)
    print("mean = ", scores.mean())
    print("variance = ", scores.var())
    models[model].fit(learning_data, sentiments_learning)
    print("score on the learning data (accuracy) = ", accuracy_score(models[model].predict(learning_data), sentiments_learning))
    print("")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

=== logitic regression ===
scores =  [0.80765067 0.8084595  0.81389222 0.80943121 0.81728927 0.81302571
 0.81845345 0.81228997 0.81437704 0.80864865]
mean =  0.8123517684080914
variance =  1.2803496176256075e-05


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


score on the learning data (accuracy) =  0.8770430906389302

=== bernoulliNB ===
scores =  [0.78836162 0.78637201 0.78911565 0.78617811 0.7878439  0.78411569
 0.80073294 0.78827061 0.78760757 0.78582968]
mean =  0.7884427772108846
variance =  1.8756199477690115e-05
score on the learning data (accuracy) =  0.9308635272602583

=== multinomialNB ===
scores =  [0.80428067 0.80801665 0.80592755 0.80773054 0.81435017 0.80880088
 0.81443186 0.81006262 0.80913504 0.80652629]
mean =  0.8089262260491135
variance =  9.967175034225488e-06
score on the learning data (accuracy) =  0.9199337067093382



### MultinomialNB Selected

## Fine tune the model
Use of the GridSearchCV to choose the best parameters to use. 
What the GridSearchCV does is trying different set of parameters, and for each one, it runs a cross validation and estimate the score. At the end we can see what are the best parameter and use them to build a better classifier.

In [None]:
from sklearn.model_selection import GridSearchCV


grid_search_pipeline = Pipeline([
    ('text_pre_processing', TextPreProc()),
    ('vectorizer', TfidfVectorizer()),
    ('model', MultinomialNB()),
])

params = [
    {
        'text_pre_processing__use_mention': [True, False],
        'vectorizer__max_features': [1000, 2000, 5000, 10000, 20000, None],
        'vectorizer__ngram_range': [(1,1), (1,2)],
    },
]
grid_search = GridSearchCV(grid_search_pipeline, params, cv=5, scoring='f1')
grid_search.fit(learn_data, sentiments_learning)
print(grid_search.best_params_)

{'text_pre_processing__use_mention': True, 'vectorizer__max_features': None, 'vectorizer__ngram_range': (1, 2)}


## Test

In [None]:
mnb.fit(learning_data, sentiments_learning)

In [None]:
testing_data = pipeline.transform(test_data)
mnb.score(testing_data, sentiments_test)

0.7523085641897523

## Achived Accuracy 0.75, Not Bad!!

In [None]:
# Predecting on the test.csv
sub_data = pd.read_csv("Tweets/test.csv", encoding='ISO-8859-1')
sub_learning = pipeline.transform(sub_data.SentimentText)
sub = pd.DataFrame(sub_data.ItemID, columns=("ItemID", "Sentiment"))
sub["Sentiment"] = mnb.predict(sub_learning)
print(sub)

        ItemID  Sentiment
0            1          0
1            2          0
2            3          1
3            4          0
4            5          0
...        ...        ...
299984  299996          1
299985  299997          1
299986  299998          1
299987  299999          1
299988  300000          1

[299989 rows x 2 columns]


In [23]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
# Create a Multinomial Naive Bayes model
model = MultinomialNB()

# Train the model on the learning data (presumably training data)
model.fit(learning_data, sentiments_learning)

# Take user input for a new tweet
tweet = pd.Series([input(),])

# Transform the input tweet using the pre-processing pipeline
tweet = pipeline.transform(tweet)

# Use the trained model to predict probabilities for the sentiment of the input tweet
proba = model.predict_proba(tweet)[0]

# Print the predicted probabilities
print("The probability that this tweet is sad is:", proba[0])
print("The probability that this tweet is happy is:", proba[1])


The probability that this tweet is sad is: 0.8468335449291714
The probability that this tweet is happy is: 0.15316645507082907


# LSTM

In [25]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import re



In [48]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Splitting Data
learn_data, test_data, sentiments_learning, sentiments_test = train_test_split(
    train_data['SentimentText'], train_data['Sentiment'], test_size=0.3
)

# Using the pipeline to preprocess and vectorize the training data
learning_data = pipeline.fit_transform(learn_data)

# Filter out Neutral sentiment from training data only
filtered_data = learn_data[sentiments_learning != "Neutral"]
filtered_sentiments = sentiments_learning[sentiments_learning != "Neutral"]

# Convert text to lowercase
filtered_data = filtered_data.apply(lambda x: x.lower())

# Remove non-alphanumeric characters
filtered_data = filtered_data.apply(lambda x: re.sub('[^a-zA-z0-9\s]', '', x))

# Display positive and negative sentiment sizes
print("Negative (Sad) Sentiment Size:", (filtered_sentiments == 0).sum())
print("Positive (Happy) Sentiment Size:", (filtered_sentiments == 1).sum())

# Replace 'rt' with a space
for idx, row in filtered_data.items():
    filtered_data[idx] = row.replace('rt', ' ')

# Tokenization and Padding
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(filtered_data.values)
X = tokenizer.texts_to_sequences(filtered_data.values)
X = pad_sequences(X)





Negative (Sad) Sentiment Size: 30501
Positive (Happy) Sentiment Size: 39491


Positive Sentiment Size: 0
Negative Sentiment Size: 0
Length of filtered_data: 69992
Length of filtered_sentiments: 69992
Sample of filtered_sentiments: 80361    0
12469    0
31348    1
4507     0
98866    1
Name: Sentiment, dtype: int64
Class Distribution in filtered_sentiments:
Sentiment
1    39623
0    30369
Name: count, dtype: int64


In [49]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 85, 128)           256000    
                                                                 
 spatial_dropout1d_1 (Spati  (None, 85, 128)           0         
 alDropout1D)                                                    
                                                                 
 lstm_1 (LSTM)               (None, 196)               254800    
                                                                 
 dense_1 (Dense)             (None, 2)                 394       
                                                                 
Total params: 511194 (1.95 MB)
Trainable params: 511194 (1.95 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [51]:
Y = pd.get_dummies(sentiments_learning).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(46894, 85) (46894, 2)
(23098, 85) (23098, 2)


In [52]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)

Epoch 1/7


1466/1466 - 297s - loss: 0.5355 - accuracy: 0.7254 - 297s/epoch - 203ms/step
Epoch 2/7
1466/1466 - 302s - loss: 0.4785 - accuracy: 0.7675 - 302s/epoch - 206ms/step
Epoch 3/7
1466/1466 - 296s - loss: 0.4590 - accuracy: 0.7802 - 296s/epoch - 202ms/step
Epoch 4/7
1466/1466 - 343s - loss: 0.4426 - accuracy: 0.7880 - 343s/epoch - 234ms/step
Epoch 5/7
1466/1466 - 344s - loss: 0.4289 - accuracy: 0.7975 - 344s/epoch - 234ms/step
Epoch 6/7
1466/1466 - 335s - loss: 0.4143 - accuracy: 0.8041 - 335s/epoch - 228ms/step
Epoch 7/7
1466/1466 - 389s - loss: 0.4014 - accuracy: 0.8139 - 389s/epoch - 265ms/step


<keras.src.callbacks.History at 0x1aa81606dd0>

In [53]:
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

675/675 - 28s - loss: 0.5182 - accuracy: 0.7611 - 28s/epoch - 41ms/step
score: 0.52
acc: 0.76


In [54]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

1/1 - 1s - 822ms/epoch - 822ms/step
1/1 - 0s - 73ms/epoch - 73ms/step
1/1 - 0s - 123ms/epoch - 123ms/step
1/1 - 0s - 131ms/epoch - 131ms/step
1/1 - 0s - 101ms/epoch - 101ms/step
1/1 - 0s - 80ms/epoch - 80ms/step
1/1 - 0s - 77ms/epoch - 77ms/step
1/1 - 0s - 71ms/epoch - 71ms/step
1/1 - 0s - 78ms/epoch - 78ms/step
1/1 - 0s - 110ms/epoch - 110ms/step
1/1 - 0s - 81ms/epoch - 81ms/step
1/1 - 0s - 76ms/epoch - 76ms/step
1/1 - 0s - 82ms/epoch - 82ms/step
1/1 - 0s - 78ms/epoch - 78ms/step
1/1 - 0s - 84ms/epoch - 84ms/step
1/1 - 0s - 152ms/epoch - 152ms/step
1/1 - 0s - 147ms/epoch - 147ms/step
1/1 - 0s - 176ms/epoch - 176ms/step
1/1 - 0s - 61ms/epoch - 61ms/step
1/1 - 0s - 73ms/epoch - 73ms/step
1/1 - 0s - 88ms/epoch - 88ms/step
1/1 - 0s - 80ms/epoch - 80ms/step
1/1 - 0s - 62ms/epoch - 62ms/step
1/1 - 0s - 71ms/epoch - 71ms/step
1/1 - 0s - 76ms/epoch - 76ms/step
1/1 - 0s - 82ms/epoch - 82ms/step
1/1 - 0s - 63ms/epoch - 63ms/step
1/1 - 0s - 71ms/epoch - 71ms/step
1/1 - 0s - 98ms/epoch - 98ms/ste

In [66]:
twt = ['He scored Runs.']
#vectorizing the tweet by the pre-fitted tokenizer instance
twt = tokenizer.texts_to_sequences(twt)
#padding the tweet to have exactly the same shape as `embedding_2` input
twt = pad_sequences(twt, maxlen=28, dtype='int32', value=0)
print(twt)
sentiment = model.predict(twt,batch_size=1,verbose = 2)[0]
if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0 86]]
1/1 - 0s - 60ms/epoch - 60ms/step
positive
