# Twitter Analysis

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# read Data for twitter sentiment analysis
df=pd.read_csv("Twitter_Data.csv")
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [3]:
X=df.iloc[:,0] # independent variable
y=df.iloc[:,1] # dependent variable

In [4]:
# Change our dependent variable to categorical. ( 0 to “Neutral,” -1 to “Negative”, 1 to “Positive”)
df['category']=df['category'].map({0.0:'neutral',-1.0:'negative',1.0:'positive'})

In [5]:
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,negative
1,talk all the nonsense and continue all the dra...,neutral
2,what did just say vote for modi welcome bjp t...,positive
3,asking his supporters prefix chowkidar their n...,positive
4,answer who among these the most powerful world...,positive


In [6]:
# Do Missing value analysis and drop all null/missing values
df.isnull().sum()

clean_text    4
category      7
dtype: int64

In [7]:
df.dropna(inplace=True)

In [8]:
df.isnull().sum()

clean_text    0
category      0
dtype: int64

In [9]:
# Do text cleaning. (remove every symbol except alphanumeric, transform all words to lower case, and remove punctuation and stopwords )
df['clean_text']=df['clean_text'].str.lower()

In [10]:
# remove every symbol except alphanumeric
df.clean_text.str.replace('[^a-zA-Z]', ' ')

  df.clean_text.str.replace('[^a-zA-Z]', ' ')


0         when modi promised  minimum government maximum...
1         talk all the nonsense and continue all the dra...
2         what did just say vote for modi  welcome bjp t...
3         asking his supporters prefix chowkidar their n...
4         answer who among these the most powerful world...
                                ...                        
162975    why these     crores paid neerav modi not reco...
162976    dear rss terrorist payal gawar what about modi...
162977    did you cover her interaction forum where she ...
162978    there big project came into india modi dream p...
162979    have you ever listen about like gurukul where ...
Name: clean_text, Length: 162969, dtype: object

In [11]:
# remove punctuation
df['clean_text'] = df['clean_text'].str.replace(r'[^\w\s]+', '')

  df['clean_text'] = df['clean_text'].str.replace(r'[^\w\s]+', '')


In [12]:
sentences=df['clean_text']

In [13]:
#  remove stopwords
import nltk
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
df['clean_text'] = df['clean_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [14]:
# Create a new column and find the length of each sentence (how many words they contain)
# sentences=df['clean_text']
df['length'] = df['clean_text'].apply(
    lambda row: min(len(row.split(" ")), len(row)) if isinstance(row, str) else None
)

In [15]:
df

Unnamed: 0,clean_text,category,length
0,modi promised minimum government maximum gover...,negative,21
1,talk nonsense continue drama vote modi,neutral,6
2,say vote modi welcome bjp told rahul main camp...,positive,13
3,asking supporters prefix chowkidar names modi ...,positive,19
4,answer among powerful world leader today trump...,positive,10
...,...,...,...
162975,456 crores paid neerav modi recovered congress...,negative,13
162976,dear rss terrorist payal gawar modi killing 10...,negative,32
162977,cover interaction forum left,neutral,4
162978,big project came india modi dream project happ...,neutral,9


In [16]:
# Split data into dependent(X) and independent(y) dataframe
X=df.iloc[:,0] # independent variable
y=df.iloc[:,1] # dependent variable

In [17]:
### Vocabulary size
voc_size=10000

In [18]:
from tensorflow.keras.preprocessing.text import one_hot
onehot_text=[one_hot(words,voc_size)for words in sentences] 
onehot_text

[[1482,
  8770,
  1594,
  2145,
  1475,
  2459,
  2399,
  9580,
  9556,
  875,
  9471,
  4340,
  2765,
  8507,
  9471,
  974,
  9748,
  1243,
  5618,
  6634,
  4715,
  534,
  974,
  2096,
  8823,
  8055,
  5376,
  8823,
  2096,
  3957,
  1992,
  8823,
  1150],
 [6760,
  9894,
  9471,
  1439,
  8823,
  3834,
  9894,
  9471,
  1091,
  9940,
  9914,
  2244,
  8770],
 [7032,
  9505,
  8533,
  9673,
  9914,
  2244,
  8770,
  7882,
  2391,
  582,
  2691,
  6141,
  9471,
  5449,
  4978,
  2244,
  8770,
  692,
  8770,
  2096,
  8533,
  2175],
 [3687,
  7719,
  5428,
  6620,
  7263,
  8601,
  1096,
  8770,
  9505,
  8047,
  1080,
  5738,
  638,
  835,
  7032,
  3728,
  7032,
  8055,
  5738,
  1415,
  6199,
  7032,
  9940,
  6553,
  3847,
  6648,
  2472,
  2020,
  6372,
  5230,
  4359,
  4447,
  1010,
  7085],
 [6717,
  5919,
  6555,
  9691,
  9471,
  6372,
  3704,
  1328,
  5833,
  3774,
  9126,
  4056,
  8770,
  3767],
 [2535, 7145, 8188, 4008, 22, 3692],
 [4382, 7041, 6253, 1154, 4085, 885, 5

In [19]:
# Add padding from the front side (use Tensorflow)
from tensorflow.keras.preprocessing.sequence import pad_sequences
sentence_len=30
docs=pad_sequences(onehot_text,padding='pre',maxlen=sentence_len) # add padding from front 
print(docs)

[[2145 1475 2459 ... 1992 8823 1150]
 [   0    0    0 ... 9914 2244 8770]
 [   0    0    0 ... 2096 8533 2175]
 ...
 [   0    0    0 ... 5052 9637 6361]
 [   0    0    0 ... 8055 2134 3560]
 [1316 5348 6779 ... 4764 4153 2244]]


In [20]:
#  Build an LSTM model and compile it (describe features, input length, vocabulary size, information drop-out layer, activation function for output, )
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout


In [21]:
embedding_vector_features=30
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sentence_len))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.3))
model.add(Dense(3,activation='softmax'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 30)            300000    
                                                                 
 bidirectional (Bidirectiona  (None, 200)              104800    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                                 
 dense (Dense)               (None, 3)                 603       
                                                                 
Total params: 405,403
Trainable params: 405,403
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
#  Do dummy variable creation for the dependent variable
dummy=pd.get_dummies(y)

In [23]:
dummy

Unnamed: 0,negative,neutral,positive
0,1,0,0
1,0,1,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
162975,1,0,0
162976,1,0,0
162977,0,1,0
162978,0,1,0


In [24]:
# split the data into tests and train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(docs, dummy, test_size=0.2, random_state=42)

In [25]:
# train  model
model.fit(X_train,y_train, validation_data=(X_test,y_test),epochs=10,batch_size=34)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1c938278b50>

In [26]:
y_pred = model.predict(X_test)



In [None]:
# Measure performance metrics and accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

# classification report

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))