from __future__ import print_function, division
from builtins import range

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): the data files read later use relative paths, so this mount may not be needed — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Tunable settings for the whole notebook.
MAX_SEQUENCE_LENGTH = 100  # length every token sequence is padded/truncated to
MAX_VOCAB_SIZE = 20000     # upper bound on the tokenizer vocabulary
EMBEDDING_DIM = 100        # embedding width; also selects the GloVe file that is loaded
VALIDATION_SPLIT = 0.2     # fraction of the data held out for validation
BATCH_SIZE = 128           # mini-batch size used for training
EPOCHS = 10                # number of training epochs

In [None]:
# Load the pre-trained GloVe vectors into a word -> vector dictionary (word2vec)

In [None]:
# Build word2vec: a dict mapping each word to its embedding vector (float32 array).
# Every non-empty line of the file is parsed as "<word> <v1> <v2> ...", split on whitespace.
# The file name depends on EMBEDDING_DIM and is resolved relative to the working directory.
glove_path = "glove.6B.%sd.txt" % EMBEDDING_DIM
word2vec = {}
with open(glove_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line has no word token; skip it instead of failing on values[0].
            continue
        word = values[0]
        vec = np.asarray(values[1:], dtype="float32")
        word2vec[word] = vec

TypeError: ignored

In [None]:
# Number of words for which a vector was loaded.
len(word2vec)

400000

In [None]:
# The training dataset

In [None]:
# Read the dataset from Dataset1.csv (path is relative to the working directory).
train = pd.read_csv("Dataset1.csv")

In [None]:
# Count the missing values in the Text column.
train["Text"].isnull().sum()

0

In [None]:
# Preview the first rows of the dataset.
train.head()

Unnamed: 0,Label,Text
0,1,nen á vist bolest vztek smutek ज़मातेक ोसम ě lo...
1,1,हाँ यार नेहा कब करेगा वह पोस्ट उसने न सच में p...
2,0,television media congress के लिए नही ह . ये तो...
3,2,आल इंडिया me ंर्क लागु करे w कश्मीर से dhara 3...
4,1,who पागल है क्या ? They aren ’ t real issues M...


In [None]:
# Replace missing Text entries with a placeholder string, then take the underlying array.
text_column = train["Text"].fillna("NO_COMMENT_EMPTY")
sentences = text_column.values

In [None]:
# Inspect the array of raw text entries.
sentences

array(['nen á vist bolest vztek smutek ज़मातेक ोसम ě lost beznad ě ज a nakonec जेन कलिद असि तखले विपद á म ů j life ...',
       'हाँ यार नेहा कब करेगा वह पोस्ट उसने न सच में photoshoot करना चाहिए फिर वह पोस्ट करेगा  ',
       'television media congress के लिए नही ह . ये तोह आपको पता चल ही गया होगा . अच्छा होगा कि कांग्रेस ke  ',
       ...,
       'भारत माता की जय जय हिन्द जय भारत श्रीमाँ देश का एक नागरिक होने के साथ मेरे मन में एक विचार उत्पन्न हुआ h  ',
       'EVM के खिलाफ अब शाश्त्र आंदोलन के सिवाय और कोई चारा नहीं . 85% मूलवासी आंबेडकर तुम्हारे किसी काम नहीं आया  ',
       'RT ind Teacher - class मैं कोई टिफ़िन नहीं खायेगा . * पीछे सीट वाले हरामी बच्चे * Krishna बीएस '],
      dtype=object)

In [None]:
# One entry per row of the dataset.
sentences.shape

(39999,)

In [None]:
# Names of the column(s) used as the prediction target.
possible_labels = ["Label"]

In [None]:
# Look at a single raw text entry (index 1).
sentences[1]

'हाँ यार नेहा कब करेगा वह पोस्ट उसने न सच में photoshoot करना चाहिए फिर वह पोस्ट करेगा  '

In [None]:
# Target array with one column per name in possible_labels (2-D: rows x label columns).
targets = train.loc[:, possible_labels].values

In [None]:
# Inspect the target array.
targets

array([[1],
       [1],
       [0],
       ...,
       [1],
       [0],
       [1]], dtype=int64)

In [None]:
# Display the full DataFrame.
train

Unnamed: 0,Label,Text
0,1,nen á vist bolest vztek smutek ज़मातेक ोसम ě lo...
1,1,हाँ यार नेहा कब करेगा वह पोस्ट उसने न सच में p...
2,0,television media congress के लिए नही ह . ये तो...
3,2,आल इंडिया me ंर्क लागु करे w कश्मीर से dhara 3...
4,1,who पागल है क्या ? They aren ’ t real issues M...
...,...,...
39994,0,उसकी फ़िक्र चोर क जो आपकी मुल्क में छोटी बचिओ k...
39995,2,रत मेरे सोहने मुर्शिद Ji ਹੁਣ ਤੁਸੀਂ ਨਹੀਂ ਕਰੋਗੇ ...
39996,1,भारत माता की जय जय हिन्द जय भारत श्रीमाँ देश क...
39997,0,EVM के खिलाफ अब शाश्त्र आंदोलन के सिवाय और कोई...


In [None]:
# Preprocessing and tokenizing

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# Tokenizer whose vocabulary size is capped via num_words=MAX_VOCAB_SIZE.
tokenizer= Tokenizer(num_words=MAX_VOCAB_SIZE)

In [None]:
# Build the tokenizer's vocabulary from the text entries.
tokenizer.fit_on_texts(sentences)

In [None]:
# Convert each text entry into a list of integer token ids.
sequences = tokenizer.texts_to_sequences(sentences)

In [None]:
# Show only the first few token-id lists; displaying the whole list floods the output.
sequences[:3]

[[2074, 14242, 784, 14242, 957, 23, 9626, 7593, 2074, 157, 8560, 669, 285],
 [745,
  463,
  8561,
  469,
  613,
  191,
  1265,
  2133,
  31,
  560,
  6,
  260,
  195,
  132,
  191,
  1265,
  613],
 [8562,
  216,
  113,
  5,
  64,
  67,
  52,
  16,
  105,
  93,
  169,
  385,
  19,
  72,
  154,
  276,
  154,
  145,
  219,
  299],
 [381,
  170,
  76,
  4175,
  330,
  861,
  1033,
  9,
  10976,
  1982,
  330,
  71,
  1521,
  4,
  443,
  250,
  375,
  1],
 [308,
  829,
  1,
  45,
  245,
  9627,
  41,
  196,
  710,
  3995,
  14243,
  18,
  1983,
  870,
  2514,
  6,
  30,
  1],
 [141,
  2,
  958,
  692,
  1358,
  4176,
  33,
  17,
  1286,
  210,
  331,
  10977,
  46,
  2,
  254,
  28,
  3801,
  29,
  1522,
  2,
  10978],
 [130, 3651, 148, 2187, 2, 243, 597, 535, 6960, 2657, 350, 950, 431],
 [303, 33, 216, 547, 4, 15, 1523, 594, 1, 124, 224, 5, 1545, 129, 6],
 [134,
  45,
  5136,
  1,
  215,
  281,
  2188,
  1335,
  147,
  31,
  622,
  102,
  75,
  1521,
  15,
  3,
  119],
 [1002, 27, 1597, 79

In [None]:
# Show only the first 20 word -> index entries; the full mapping is far too large to display.
dict(list(tokenizer.word_index.items())[:20])

{'है': 1,
 'की': 2,
 'नहीं': 3,
 'को': 4,
 'के': 5,
 'में': 6,
 'का': 7,
 'rt': 8,
 'से': 9,
 'तो': 10,
 'to': 11,
 'और': 12,
 'हो': 13,
 'the': 14,
 'भी': 15,
 'ये': 16,
 'जी': 17,
 'is': 18,
 'ही': 19,
 'of': 20,
 'क': 21,
 'you': 22,
 'a': 23,
 'and': 24,
 'आप': 25,
 'for': 26,
 'i': 27,
 'ने': 28,
 'कर': 29,
 'जो': 30,
 'न': 31,
 'कोई': 32,
 'मोदी': 33,
 'इस': 34,
 'with': 35,
 'in': 36,
 'हैं': 37,
 'बहुत': 38,
 'एक': 39,
 'ी': 40,
 '’': 41,
 'this': 42,
 'my': 43,
 'अब': 44,
 'क्या': 45,
 'देश': 46,
 'face': 47,
 'सर': 48,
 'तुम': 49,
 'यू': 50,
 'पर': 51,
 'ह': 52,
 'था': 53,
 'on': 54,
 'all': 55,
 'लव': 56,
 'लोग': 57,
 'वो': 58,
 'it': 59,
 'कुछ': 60,
 'are': 61,
 'so': 62,
 'good': 63,
 'लिए': 64,
 'पे': 65,
 'जय': 66,
 'नही': 67,
 'हे': 68,
 'रहे': 69,
 'भाई': 70,
 'हम': 71,
 'गया': 72,
 'दिया': 73,
 'best': 74,
 'तू': 75,
 'me': 76,
 'रत': 77,
 'रहा': 78,
 'that': 79,
 'k': 80,
 'we': 81,
 'ु': 82,
 'your': 83,
 'bjp': 84,
 'यह': 85,
 'अपने': 86,
 'जैसे': 87,
 'माँ': 88,
 

In [None]:
# Summary statistics over the tokenized sequences.
seq_lengths = list(map(len, sequences))
print("max sequence length:", max(seq_lengths))
print("min sequence length:", min(seq_lengths))
s = sorted(seq_lengths)
print("median sequence length:", s[len(s) // 2])

# Empty sequences are left out because max() cannot be taken over an empty list.
print("max word index:", max(max(seq) for seq in sequences if seq))


# get word -> integer mapping
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))

max sequence length: 119
min sequence length: 0
median sequence length: 17
max word index: 19999
Found 33063 unique tokens.


In [None]:
# Preprocessing the text data

In [None]:
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries so the result is an N x T matrix.
# padding/truncating are spelled out with their default value ("pre"): zeros are added, and
# overlong sequences are cut, at the front.
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="pre",
    truncating="pre",
)
print('Shape of data tensor:', data.shape)

Shape of data tensor: (39999, 100)


In [None]:
# Inspect one padded row (index 51).
data[51]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,  1073,  1073,  1073,
         114,  2311,     4,    19,   715,  1066,     2,    91,     5,
          46,     2,    91,     5,   347,     2,    91,     2, 10993,
           2])

In [None]:
# Padding

In [None]:
# pad sequences so that we get a N x T matrix
# NOTE(review): this repeats the identical pad_sequences call made in an earlier cell,
# so `data` is simply recomputed with the same arguments.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Unpadded token ids of the entry at index 1, for comparison with its padded row.
sequences[1]

[745,
 463,
 8561,
 469,
 613,
 191,
 1265,
 2133,
 31,
 560,
 6,
 260,
 195,
 132,
 191,
 1265,
 613]

In [None]:
# Print the shape of the padded matrix and also show it as the cell's value.
data_shape = data.shape
print('Shape of data tensor:', data_shape)
data_shape


Shape of data tensor: (39999, 100)


(39999, 100)

In [None]:
# Creating the embedding matrix

In [None]:
# Configured cap on the vocabulary size.
MAX_VOCAB_SIZE

20000

In [None]:
# Number of distinct tokens plus one.
# NOTE(review): the +1 presumably accounts for index 0, which the padded rows use as filler — confirm.
len(word2idx) + 1

33064

In [None]:
# The smaller of the configured cap and the number of tokens plus one.
min(MAX_VOCAB_SIZE, len(word2idx) + 1)

20000

In [None]:
# Number of rows the embedding matrix will have.
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)

In [None]:
# Width of each embedding vector.
EMBEDDING_DIM

100

In [None]:
# Start from an all-zero (num_words x EMBEDDING_DIM) matrix.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

In [None]:
# Inspect the matrix; right after creation with np.zeros every entry is 0.
embedding_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
embedding_matrix.shape  # (V, D): num_words rows by EMBEDDING_DIM columns

(20000, 100)

In [None]:
# Walk the vocabulary and stop at the first word that has a loaded vector, so that
# `word` and `i` stay bound to a pair that can be inspected afterwards.
for word, i in word2idx.items():
    if word in word2vec:
        break

In [None]:
# Look up the vector for whatever `word` is currently bound to; dict.get yields None if it is absent.
word2vec.get(word)

array([ 0.28365  , -0.6263   , -0.44351  ,  0.2177   , -0.087421 ,
       -0.17062  ,  0.29266  , -0.024899 ,  0.26414  , -0.17023  ,
        0.25817  ,  0.097484 , -0.33103  , -0.43859  ,  0.0095799,
        0.095624 , -0.17777  ,  0.38886  ,  0.27151  ,  0.14742  ,
       -0.43973  , -0.26588  , -0.024271 ,  0.27186  , -0.36761  ,
       -0.24827  , -0.20815  ,  0.22128  , -0.044409 ,  0.021373 ,
        0.24594  ,  0.26143  ,  0.29303  ,  0.13281  ,  0.082232 ,
       -0.12869  ,  0.1622   , -0.22567  , -0.060348 ,  0.28703  ,
        0.11381  ,  0.34839  ,  0.3419   ,  0.36996  , -0.13592  ,
        0.0062694,  0.080317 ,  0.0036251,  0.43093  ,  0.01882  ,
        0.31008  ,  0.16722  ,  0.074112 , -0.37745  ,  0.47363  ,
        0.41284  ,  0.24471  ,  0.075965 , -0.51725  , -0.49481  ,
        0.526    , -0.074645 ,  0.41434  , -0.1956   , -0.16544  ,
       -0.045649 , -0.40153  , -0.13136  , -0.4672   ,  0.18825  ,
        0.2612   ,  0.16854  ,  0.22615  ,  0.62992  , -0.1288

In [None]:
# Copy each known word's vector into the row given by its tokenizer index.
for word, i in word2idx.items():
    if i >= MAX_VOCAB_SIZE:
        # Index falls outside the capped vocabulary.
        continue
    embedding_vector = word2vec.get(word)
    if embedding_vector is None:
        # No vector loaded for this word: its row keeps the initial zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Inspect the matrix after the fill loop; rows of words without a loaded vector remain zero.
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.069229  ,  0.59527999, -0.041912  , ...,  0.039572  ,
        -0.74014997,  0.011751  ],
       [-0.13083   ,  0.33579001,  0.0032469 , ...,  0.092978  ,
        -0.67706001, -0.27642   ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.7791    , -0.31294999, -0.114     , ...,  0.48513001,
         0.32253   ,  0.32424   ]])

In [None]:
# Shape check: (num_words, EMBEDDING_DIM).
embedding_matrix.shape

(20000, 100)

In [None]:
# NOTE(review): these assignments repeat, with the same values, the settings cell near the top
# of the notebook.
MAX_SEQUENCE_LENGTH = 100
MAX_VOCAB_SIZE = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 10

In [None]:
# Embedding layer initialised from embedding_matrix and frozen (trainable=False),
# so the pre-trained vectors are not updated during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [None]:
# Show the layer's configuration. This replaces a stray IPython `?` help lookup, which is
# interactive-only syntax and not valid Python.
embedding_layer.get_config()

In [None]:
# Model input: each sample is a vector of MAX_SEQUENCE_LENGTH entries (the batch axis is implicit).
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,)) # per-sample shape

In [None]:
# Feature extractor: embedding lookup followed by three Conv1D stages and a dense layer.
x = embedding_layer(input_)

# The first two stages are identical: Conv1D(128 filters, kernel size 3) then MaxPooling1D(3).
for stage in range(2):
    x = Conv1D(128, 3, activation="relu")(x)
    x = MaxPooling1D(3)(x)

# The last convolution is reduced over the sequence axis by global max pooling.
x = Conv1D(128, 3, activation="relu")(x)
x = GlobalMaxPooling1D()(x)

x = Dense(128, activation="relu")(x)


In [None]:
# The Label column holds integer class ids (0, 1 and 2 appear in the data preview), i.e. one
# class per sample. A single sigmoid unit with binary cross-entropy cannot represent that, so
# the head is a softmax over all classes trained with sparse categorical cross-entropy, which
# takes the integer ids in `targets` directly.
# Assumes the class ids are integers starting at 0 — TODO confirm against the dataset.
num_classes = int(targets.max()) + 1
output = Dense(num_classes, activation="softmax")(x)

# Assemble and compile the classifier.
model = Model(input_, output)
model.compile(
  loss='sparse_categorical_crossentropy',
  optimizer='rmsprop',
  metrics=['accuracy']
)

In [None]:
# Padded token ids of the entry at index 1.
data[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,  745,  463, 8561,  469,  613,
        191, 1265, 2133,   31,  560,    6,  260,  195,  132,  191, 1265,
        613])

In [None]:
# Target array that is passed to model.fit.
targets

array([[1],
       [1],
       [0],
       ...,
       [1],
       [0],
       [1]], dtype=int64)

In [None]:
# Train the model; `r` keeps the object returned by fit.
# The epoch count comes from the EPOCHS setting instead of a hard-coded literal, so it is
# tuned in the same place as the other hyper-parameters.
# NOTE(review): Keras takes the validation_split fraction from the end of the arrays without
# shuffling first — confirm the rows of the CSV are not ordered by label.
r = model.fit(
    data,
    targets,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
