In [2]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import seaborn as sns
import sklearn
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from wordcloud import WordCloud,STOPWORDS

## READING THE CSV FILE

In [3]:
# Column names are supplied through `names=` and the file is decoded as ISO-8859-1.
df = pd.read_csv(
    "../input/sentiment140/training.1600000.processed.noemoticon.csv",
    names=["target", "ids", "date", "flag", "user", "text"],
    encoding="ISO-8859-1",
)

In [4]:
df.head()

In [5]:
df.describe()

In [6]:
df["target"].value_counts()

## This dataset is balanced. The target indicates the sentiment of the tweet: 0 means negative, while 4 means positive.

In [7]:
# Human-readable name for each raw target code.
sentiment = {0: "negative", 4: "positive"}
# Direct dict lookup per row: an unexpected code raises KeyError instead of
# silently producing a missing label.
df["sentiment"] = df["target"].map(lambda code: sentiment[code])

In [8]:
df.head()

In [9]:
df["sentiment"].value_counts()

## label encoding target

In [10]:
# Raw Sentiment140 codes -> contiguous class ids (0 = negative, 1 = positive).
labelencode={0:0,4:1}
# Already-encoded ids map to themselves, so re-running this cell is a no-op
# instead of a KeyError on 1. Any other value still raises KeyError.
_label_lookup={**labelencode, **{encoded:encoded for encoded in labelencode.values()}}
df["target"]=df["target"].map(lambda x:_label_lookup[x])

In [11]:
df.head()

In [12]:
df["target"].value_counts()

## Let's learn more about the data. This dataset contains:
* target    : integer value representing sentiment of tweet(0 means negative while 1 means positive) 
* ids       : an integer to uniquely identify every tweet
* date      : date on which tweet was posted
* flag      : the query used to collect the tweet (`NO_QUERY` when there was none)
* user      : twitter handle of the one who posted the tweet
* text      : actual tweet
* sentiment : string representing the sentiment of tweet

### Since this is a text classification problem, I am going to drop all features except target, text and sentiment

In [13]:
# Keep only the columns the text classifiers need.
kept_columns = ["text", "sentiment", "target"]
df = df.loc[:, kept_columns]

In [14]:
df.head()

## Preprocessing the tweets

In [15]:
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

stop_words = set(stopwords.words('english'))

# Built once: both objects are the same for every tweet, so recreating them
# on each call is pure overhead when the function is mapped over a column.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_LEMMATIZER = WordNetLemmatizer()


def clean_text(text):
    """Normalise a raw tweet into a space-separated string of lemmas.

    Steps, in order: split on whitespace, lower-case, strip ASCII
    punctuation, drop tokens that are not purely alphabetic, drop English
    stop words, then lemmatize each remaining token with WordNet.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; an empty string when
        no token survives the filters.
    """
    # split() with no argument splits on any run of whitespace. Splitting on
    # ' ' only would leave "word\nword" as one token, which the isalpha()
    # filter below would then discard entirely.
    tokens = (w.lower().translate(_PUNCTUATION_TABLE) for w in text.split())

    # isalpha() also removes the empty strings left behind by tokens that
    # consisted only of punctuation.
    kept = [w for w in tokens if w.isalpha() and w not in stop_words]

    return ' '.join(_LEMMATIZER.lemmatize(w) for w in kept)

In [16]:
df["CleanedText"]=df["text"].map(clean_text)

## Saving the preprocessed dataframe for future use

In [17]:
df.to_csv("Preprocessed.csv",index=False)

In [18]:
df.head()

### Shuffling the dataset.

In [19]:
# Fixed seed so the row order, and everything derived from it, is the same on
# every Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

### Checking for missing values

In [20]:
df.isnull().sum()

## Doing EDA on data to discover the hidden trends

### Bar chart representing Sentiment Distribution in the dataset

In [20]:
# Number of tweets per sentiment label, as a plain {label: count} dict.
target_cnt=df["sentiment"].value_counts().to_dict()
plt.figure(figsize=(8,4))
plt.bar(list(target_cnt.keys()), list(target_cnt.values()))
plt.title("Dataset labels distribution")
plt.xlabel("sentiment")
plt.ylabel("number of tweets")
plt.show()

### Creating a new column that will store the length of tweets

In [21]:
# Word count of each cleaned tweet. split() with no argument returns [] for an
# empty string, so tweets that were cleaned down to nothing get length 0;
# split(' ') would return [''] and report them as one word.
df.loc[:,"LengthOfTweets"]=df["CleanedText"].map(lambda x: len(x.split()))
df.head()

### Plotting histogram for length of tweets

In [22]:
sns.histplot(data=df, x="LengthOfTweets",bins=10)

#### This is right-skewed (positively skewed): most tweets are short, with a long tail of longer ones

### Plotting histogram of length of tweets for each unique sentiment

In [23]:
# One frame per sentiment; later cells reuse both names.
dfNegative=df[df.sentiment=="negative"]
dfPositive=df[df.sentiment=="positive"]

# Side-by-side histograms drawn on explicit Axes rather than the pyplot
# "current axes", so each title is attached to the panel it describes.
fig, axes = plt.subplots(1, 2, figsize=(18,4))
panels = (
    (axes[0], dfNegative, "Distribution of Length of negative tweets"),
    (axes[1], dfPositive, "Distribution of Length of positive tweets"),
)
for ax, frame, title in panels:
    sns.histplot(data=frame, x="LengthOfTweets", bins=10, ax=ax)
    ax.set_title(title)
plt.show()

In [24]:
# Summary statistics of tweet length per sentiment and overall, with a blank
# line between consecutive summaries.
length_summaries = [
    ("Negative Sentiment Tweets:", dfNegative),
    ("Positive Sentiment Tweets:", dfPositive),
    ("ALL Tweets:", df),
]
for position, (heading, frame) in enumerate(length_summaries):
    if position > 0:
        print()
    print(heading)
    print(frame["LengthOfTweets"].describe())

In [25]:
def _length_stats(frame):
    """Return the min, max and mean of `LengthOfTweets` keyed by bar label."""
    lengths = frame["LengthOfTweets"]
    return {
        "Minimum Length": lengths.min(),
        "Maximum Length": lengths.max(),
        "Mean Length": lengths.mean(),
    }


NegativeTweets=_length_stats(dfNegative)
PositiveTweets=_length_stats(dfPositive)

# These bars show three summary statistics per class, not a distribution, so
# the titles say so.
fig, axes = plt.subplots(1, 2, figsize=(18,4))
panels = (
    (axes[0], NegativeTweets, "Length statistics of negative tweets"),
    (axes[1], PositiveTweets, "Length statistics of positive tweets"),
)
for ax, stats, title in panels:
    ax.bar(list(stats.keys()), list(stats.values()))
    ax.set_title(title)
plt.show()

### Box plot of length of tweets

In [26]:
plt.figure(figsize=(10,4))
plt.boxplot(df["LengthOfTweets"])
plt.show()

## Model Training

### Training LSTM network

In [27]:
# Word-level tokenizer capped at 30000 entries; words outside that cap are
# mapped to the '<UNk>' placeholder. The cap matches the embedding vocabulary
# size used when the model is built.
tokenizer = Tokenizer(
    num_words=30000,
    oov_token='<UNk>',
)
# The vocabulary is learned from every cleaned tweet in the frame.
tokenizer.fit_on_texts(df["CleanedText"])

In [28]:
maxlen=45           
vocab_size=30000    
maxlen,vocab_size

In [29]:
def padding_seq(tokenizer,seq,max_len=None):
    """Turn texts into fixed-length sequences of token ids.

    Parameters
    ----------
    tokenizer : Tokenizer
        A fitted tokenizer exposing ``texts_to_sequences``.
    seq : iterable of str
        The texts to encode.
    max_len : int, optional
        Length every sequence is padded or truncated to. Defaults to the
        notebook-level ``maxlen`` so existing two-argument calls behave as
        before.

    Returns
    -------
    The result of ``pad_sequences``: one row per text, padded and truncated
    at the end ('post') to ``max_len`` entries.
    """
    if max_len is None:
        max_len = maxlen
    token_ids=tokenizer.texts_to_sequences(seq)
    return pad_sequences(token_ids,truncating='post',padding='post',maxlen=max_len)

In [30]:
X=df["CleanedText"]
Y=df["target"]
X=padding_seq(tokenizer,X)

In [31]:
# 90/10 stratified split with a fixed seed. train_test_split is imported by
# name at the top of the notebook, so this does not rely on
# sklearn.model_selection being reachable as an attribute of `sklearn`.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.10, random_state=42, stratify=Y
)

In [32]:
# Layer stack: 16-dimensional embeddings -> two stacked bidirectional LSTMs
# (20 units per direction) -> 2-way softmax.
lstm_layers = [
    tf.keras.layers.Embedding(vocab_size, 16, input_length=maxlen),
    # return_sequences=True hands the full sequence to the next recurrent layer.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)),
    tf.keras.layers.Dense(2, activation='softmax'),
]
model = tf.keras.models.Sequential(lstm_layers)

# Sparse categorical cross-entropy takes the integer targets directly, so no
# one-hot encoding is needed.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [33]:
model.summary()

In [34]:
# NOTE(review): the test split is also used as validation data, so the
# early-stopping decision has seen it; val_accuracy is therefore an optimistic
# estimate of held-out performance.
h=model.fit(
    X_train,y_train,
    validation_data=(X_test,y_test),
    epochs=20,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=3,
            # Without this the model keeps the weights from `patience` epochs
            # after the best one, and that is what gets saved afterwards.
            restore_best_weights=True,
        )
    ]
)

In [35]:
import pickle

# Persist the trained network and the tokenizer it was trained with; both are
# needed to score new text.
model.save("TwitterSentimentAnalysisLSTMNetwork.h5")

filename = 'tokenizer.pkl'
# The context manager closes the file even if pickling raises.
with open(filename,'wb') as outfile:
    pickle.dump(tokenizer,outfile)

### Trying Tfidf with Logistic Regression

In [36]:
X=df["CleanedText"]
Y=df["target"]

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level uni- and bi-gram TF-IDF. Terms seen in fewer than 3 documents are
# dropped, and term frequency is log-scaled (sublinear_tf).
# The three switches are real booleans: scikit-learn's parameter validation
# expects bool for these options, not the integer 1.
tfidf=TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words="english",
)

In [38]:
# Learn the vocabulary and IDF weights from X.
# NOTE(review): X is still the whole CleanedText column at this point; the
# train/test split only happens in a later cell, so the rows that end up in
# the test set also contribute to the fitted statistics.
tfidf.fit(X)

In [39]:
X=tfidf.transform(X)
X.shape

In [40]:
# 80/20 stratified split with a fixed seed. train_test_split is imported by
# name at the top of the notebook, so this does not rely on
# sklearn.model_selection being reachable as an attribute of `sklearn`.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42, stratify=Y
)

In [41]:
from sklearn.linear_model import LogisticRegression

# Baseline linear classifier on the TF-IDF features.
lr=LogisticRegression(solver='liblinear')
lr.fit(X_train,y_train)
y_pred=lr.predict(X_test)
# accuracy_score is imported by name at the top of the notebook rather than
# reached through the `sklearn.metrics` attribute.
print(accuracy_score(y_test,y_pred))

In [42]:
import pickle

# Persist the fitted vectorizer and classifier together; the classifier is
# only usable with the vectorizer it was trained on.
# Each context manager closes its file even if pickling raises.
filename = 'tfidf.pkl'
with open(filename,'wb') as outfile:
    pickle.dump(tfidf,outfile)

filename = 'logisticRegression.pkl'
with open(filename,'wb') as outfile:
    pickle.dump(lr,outfile)

### Using pretrained Bert

In [21]:
!pip install tensorflow_text

In [22]:
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub

In [23]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [24]:
# Raw strings go in; the hub preprocessing layer turns them into the inputs
# the encoder consumes.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head on the encoder's pooled output: dropout, then a 2-way
# softmax.
pooled = outputs['pooled_output']
dropped = tf.keras.layers.Dropout(0.1, name="dropout")(pooled)
class_probs = tf.keras.layers.Dense(2, activation='softmax', name="output")(dropped)

# Assemble the end-to-end text -> class-probability model.
model = tf.keras.Model(inputs=[text_input], outputs=[class_probs])

In [25]:
model.summary()

In [26]:
# `metrics` is passed as a list, which is the documented form and matches the
# LSTM model's compile call; a bare string is not accepted by every Keras
# version.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

#### Reducing the training size for faster training

In [28]:
# Small balanced subset: the first 200 rows of each class.
dfNegative=df[df.target==0].head(200)
dfPositive=df[df.target==1].head(200)
df_new=pd.concat([dfNegative,dfPositive])
# Interleave the two classes; the seed keeps the order identical across runs.
df_new = df_new.sample(frac=1, random_state=42).reset_index(drop=True)

In [29]:
X=df_new["CleanedText"].values
Y=df_new["target"].values

In [30]:
# 90/10 stratified split with a fixed seed. train_test_split is imported by
# name at the top of the notebook, so this does not rely on
# sklearn.model_selection being reachable as an attribute of `sklearn`.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.10, random_state=42, stratify=Y
)

In [31]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=3)

In [32]:
model.save("Bert.h5")