**Importing the libraries and dataset**

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd



In [2]:
pip install tensorflow_text

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Read the Twitter CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/twitter/Twitter_Data.csv")

In [4]:
# Preview the first ten rows of the loaded data.
df.iloc[:10]

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
5,kiya tho refresh maarkefir comment karo,0.0
6,surat women perform yagna seeks divine grace f...,0.0
7,this comes from cabinet which has scholars lik...,0.0
8,with upcoming election india saga going import...,1.0
9,gandhi was gay does modi,1.0


In [5]:
# Print column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [6]:
# Count how many rows carry each category label (shows the class imbalance).
df.loc[:, 'category'].value_counts()

category
 1.0    72250
 0.0    55213
-1.0    35510
Name: count, dtype: int64

**Downsampling for making the data set balanced**

Note that *downsampling* is generally not recommended for this type of task, because the rows that are discarded cause information loss. I used this technique here for its simplicity and for my own learning.

Note: Always try an oversampling technique first, as oversampling does not lead to information loss.

In [7]:
# Rows whose category label is 1.
df_pos = df.loc[df['category'].eq(1)]

In [8]:
# Rows whose category label is -1.
df_neg = df.loc[df['category'].eq(-1)]

In [9]:
# Neutral rows carry category 0 (the labels present are -1, 0 and 1).
# The previous filter used -1, which simply duplicated df_neg.
df_neut = df[df['category'] == 0]

In [10]:
# Randomly keep as many category-1 rows as there are category -1 rows.
# A fixed random_state makes the downsampled subset reproducible across runs.
df_pos_downsample = df_pos.sample(n=df_neg.shape[0], random_state=42)

In [11]:
# Stack the downsampled category-1 rows on top of the category -1 rows.
df_bal = pd.concat(objs=[df_pos_downsample, df_neg], axis=0)

In [12]:
# Inspect the combined (balanced) frame.
df_bal

Unnamed: 0,clean_text,category
98258,clear indias launch mayve been aimed target sa...,1.0
72995,these babus just know sit their office and eat...,1.0
107375,why ppl are afraid modi has done good job for ...,1.0
69915,twitterati went into tizzy today for hour when...,1.0
4485,pappu khan will vote modi not you not waste yo...,1.0
...,...,...
162956,when narender modi contested two seats was not...,-1.0
162962,modi decimates congress front crores congress ...,-1.0
162967,azamgarh please poor working requirement jogia...,-1.0
162975,why these 456 crores paid neerav modi not reco...,-1.0


In [13]:
# (rows, columns) of the balanced frame.
df_bal.shape

(71020, 2)

In [14]:
# Confirm that both remaining categories now have the same number of rows.
df_bal.loc[:, 'category'].value_counts()

category
 1.0    35510
-1.0    35510
Name: count, dtype: int64

In [15]:
# Binary target column: 1 where category is -1, otherwise 0.
df_bal['class'] = df_bal['category'].eq(-1).astype('int64')

In [16]:
# Preview the first ten rows, now including the new `class` column.
df_bal.iloc[:10]

Unnamed: 0,clean_text,category,class
98258,clear indias launch mayve been aimed target sa...,1.0,0
72995,these babus just know sit their office and eat...,1.0,0
107375,why ppl are afraid modi has done good job for ...,1.0,0
69915,twitterati went into tizzy today for hour when...,1.0,0
4485,pappu khan will vote modi not you not waste yo...,1.0,0
92375,lol already did were indra gandhinyay narendra...,1.0,0
48089,became possible because modi government tried...,1.0,0
52784,today every indian saying main bhi chowkidar m...,1.0,0
45725,when modi mentioned needs address the nation i...,1.0,0
162027,let there any propoganda twitter let media muc...,1.0,0


In [17]:
# Check dtypes and non-null counts of the balanced frame before modelling.
df_bal.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71020 entries, 98258 to 162976
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   clean_text  71019 non-null  object 
 1   category    71020 non-null  float64
 2   class       71020 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 2.2+ MB


In [18]:
# Look at ten randomly chosen rows to sanity-check the category -> class mapping.
df_bal.sample(n=10)

Unnamed: 0,clean_text,category,class
57946,policy maker itself policy breaker indian econ...,-1.0,1
114737,modi has decieved the whole nation,1.0,0
120481,modi\nbut was station\nlook bro where reached ...,-1.0,1
10701,quite strange know modi doing devlopment work ...,1.0,0
131751,remember this election against modi against ep...,1.0,0
95398,good you feed with such things too betweendnt ...,1.0,0
123531,karnataka state government recruiting non kann...,-1.0,1
152254,anti modi twitter gyanis are more concerned ab...,1.0,0
28206,news election tracker live congress chief rahu...,-1.0,1
68268,parking money for space exploration encouragem...,1.0,0


In [19]:
# Discard every row that has a missing value in any column.
df_bal = df_bal.dropna(axis=0, how='any')

In [20]:
# Per-column count of missing values.
df_bal.isnull().sum()

clean_text    0
category      0
class         0
dtype: int64

**Splitting the dataset for modelling**

In [21]:
from sklearn.model_selection import train_test_split

# Split text and labels into train and test sets. Stratifying on `class` keeps
# the class ratio the same in both parts; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_bal['clean_text'],
    df_bal['class'],
    stratify=df_bal['class'],
    random_state=42,
)

**Downloading the pretrained BERT preprocessing model and the BERT base encoder**

In [22]:
# Preprocessing layer for the uncased English BERT model, loaded from TF Hub.
bert_preprocess = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)

# Matching BERT encoder layer, loaded from TF Hub.
bert_encoder = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
)

**Building the layers for the neural network**

In [23]:
# Scalar string input: each example is one raw text.
text_input = tf.keras.Input(shape=(), dtype=tf.string, name='text')
# The preprocessing layer converts the raw strings into the encoder's inputs.
preprocessed_text = bert_preprocess(text_input)
# Encoder outputs; the 'pooled_output' entry feeds the classification head.
outputs = bert_encoder(preprocessed_text)

In [24]:
# Classification head: dropout on the pooled BERT output, followed by a single
# sigmoid unit that produces one score per example.
l = tf.keras.layers.Dropout(rate=0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(units=1, activation='sigmoid', name="output")(l)

In [25]:
# Assemble the end-to-end model: raw text in, sigmoid output of the head out.
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [26]:
# Print the layer-by-layer structure of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

**Training the model**

In [27]:
# Metrics reported while fitting the binary classifier.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=METRICS,
)

In [28]:
# Train on the training split for 10 epochs with mini-batches of 16 examples.
model.fit(x=X_train, y=y_train, batch_size=16, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7b7b36768310>

In [29]:
# Score the held-out texts and collapse the predictions into a 1-D array.
y_predicted = model.predict(X_test).flatten()

