In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [7]:
dataset = pd.read_pickle('submissions_df_clean.pkl')

In [8]:
dataset = dataset[['flair','title_processed','comments_processed','selftext_processed']]

In [9]:
dataset['text'] = 3*dataset['title_processed']+2*dataset['selftext_processed']+dataset['comments_processed']
dataset = dataset[['flair','text']]

In [10]:
dataset = dataset.assign(**pd.get_dummies(dataset['flair']))

In [11]:
top_flairs = pd.read_pickle('top_flairs.pkl')

In [12]:
flairs = top_flairs.index.to_list()
flairs

['Non-Political',
 'Politics',
 'Coronavirus',
 'AskIndia',
 'Policy/Economy',
 'Business/Finance',
 'Photography',
 '[R]eddiquette',
 'Sports',
 'Science/Technology',
 'Others']

In [13]:
dataset = dataset[['flair','text']+flairs[:-1]].assign(Others=dataset[dataset.columns.difference(flairs[:-1])].max(1))
dataset['text'] = dataset['text'].apply(lambda x: ' '.join([str(elem) for elem in x]))
dataset['text'][0]

"coronavirus covid-19 megathread news update 4 coronavirus covid-19 megathread news update 4 coronavirus covid-19 megathread news update 4 covid-19 fundraiser donation link amnesty international link cover migrant worker day-labourers vulnerable group urban poor transgender community waste-pickers sanitation worker healthcare worker doctor older person child animal care -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- indian goverment official twitter collection indian govt communication state district wise detail case india india helplines 1075 toll free 1930 toll free 1944 northeast india +911123978046 email id ncov2019 gov.in state helpline number test center list state specific thread reddit community andaman amp nicobar lakshadweep puducherry dadra amp nagar haveli daman amp diu you/thedosaman bihar you/filmmakerfarhan delhi gujarat you/helvetikka karnataka you/theclassicgallery

In [14]:
X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset.loc[:, ~dataset.columns.isin(['flair', 'text'])], test_size=0.20, random_state=42)

In [38]:
y_train

Unnamed: 0,Non-Political,Politics,Coronavirus,AskIndia,Policy/Economy,Business/Finance,Photography,[R]eddiquette,Sports,Science/Technology,Others
561,0,0,0,0,0,0,1,0,0,0,0
43,0,0,0,0,1,0,0,0,0,0,0
100,0,0,0,0,0,1,0,0,0,0,0
274,1,0,0,0,0,0,0,0,0,0,0
1265,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1130,0,1,0,0,0,0,0,0,0,0,0
1294,0,1,0,0,0,0,0,0,0,0,0
860,0,1,0,0,0,0,0,0,0,0,0
1459,0,0,0,0,1,0,0,0,0,0,0


In [16]:
X_test

482     mumbai iconic ramzan food market menu time 250...
1505    proof india self-driving car technology tesla ...
950     social distance vegetable market vijayawada an...
1005    catch itôçös late catch itôçös late catch itôç...
705     migrant worker migrant worker migrant worker a...
                              ...                        
584     tvf panchayat authentic rural india portrayal ...
310     world learn kerala fight covid-19 world learn ...
56      karnataka government decide partially ease loc...
513     look get health insurance look get health insu...
366     til oscar win song 'jai ho originally compose ...
Name: text, Length: 348, dtype: object

In [17]:
%matplotlib inline
import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns

In [13]:
# Define a pipeline combining a text feature extractor with multi lable classifier
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
for flair in flairs:
    print('... Processing {}'.format(flair))
    # train the model using X_dtm & y
    NB_pipeline.fit(X_train, y_train[flair])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(y_test[flair], prediction)))

... Processing Non-Political
Test accuracy is 0.6609195402298851
... Processing Politics
Test accuracy is 0.8103448275862069
... Processing Coronavirus
Test accuracy is 0.8017241379310345
... Processing AskIndia
Test accuracy is 0.9339080459770115
... Processing Policy/Economy
Test accuracy is 0.9511494252873564
... Processing Business/Finance
Test accuracy is 0.9770114942528736
... Processing Photography
Test accuracy is 0.9626436781609196
... Processing [R]eddiquette
Test accuracy is 0.9885057471264368
... Processing Sports
Test accuracy is 0.9885057471264368
... Processing Science/Technology
Test accuracy is 0.9885057471264368
... Processing Others
Test accuracy is 0.9396551724137931


In [18]:
RF_pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words=stop_words)), 
                        ('clf', RandomForestClassifier())])

In [19]:
RF_pipeline.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all'...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impu

In [20]:
accuracy_score(y_test, RF_pipeline.predict(X_test))

0.27011494252873564

In [21]:
f1_score(y_test, RF_pipeline.predict(X_test), average='micro')

0.3908523908523908

In [22]:
f1_score(y_test, RF_pipeline.predict(X_test), average='weighted')

  'precision', 'predicted', average, warn_for)


0.35337271952388705

In [23]:
LSVC_pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words=stop_words)), 
                        ('clf', LinearSVC())])

In [24]:
LSVC_pipeline.fit(X_train, np.argmax(np.array(y_train), axis=1))

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words={'a', 'about', 'above', 'after',
                                             'again', 'against', 'ain', 'all'...
                                             'couldn', "couldn't", ...},
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabular

In [25]:
accuracy_score(np.argmax(np.array(y_test), axis=1), LSVC_pipeline.predict(X_test))

0.6551724137931034

In [26]:
f1_score(np.argmax(np.array(y_test), axis=1), LSVC_pipeline.predict(X_test), average='micro')

0.6551724137931034

In [27]:
from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import Dense, Embedding, Input, LSTM
from keras.optimizers import Adam, SGD
from keras.callbacks import EarlyStopping

In [28]:
import tensorflow as tf

TypeError: 'Tokenizer' object is not callable

In [39]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(dataset['text'])
import pickle
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [53]:
tokenizer.texts_to_sequences( ['hello', 'r/india', 'n', 'sacrifice', 'life', 'work', 'witnessed', 'life', 'get', 'destroyed'])
tokenizer.pkl

[[1186], [19, 2], [1077], [2146], [40], [15], [32947], [40], [11], [4198]]

In [30]:
X_train_2 = tokenizer.texts_to_sequences(X_train)
X_test_2 = tokenizer.texts_to_sequences(X_test)

In [56]:
X_test

482     mumbai iconic ramzan food market menu time 250...
1505    proof india self-driving car technology tesla ...
950     social distance vegetable market vijayawada an...
1005    catch itôçös late catch itôçös late catch itôç...
705     migrant worker migrant worker migrant worker a...
                              ...                        
584     tvf panchayat authentic rural india portrayal ...
310     world learn kerala fight covid-19 world learn ...
56      karnataka government decide partially ease loc...
513     look get health insurance look get health insu...
366     til oscar win song 'jai ho originally compose ...
Name: text, Length: 348, dtype: object

In [37]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(text_y_train)
text_y_train_enc = encoder.transform(text_y_train)
text_y_test_enc = encoder.transform(text_y_test)

NameError: name 'text_y_train' is not defined

In [32]:
X_train_sequence = sequence.pad_sequences(X_train_2, maxlen=80)
X_test_sequence = sequence.pad_sequences(X_test_2, maxlen=80)

In [57]:
X_test_sequence

array([[    0,     0,     0, ...,  1336,  4728,   227],
       [   18,    23,  1634, ...,   540,   429,    86],
       [ 5260,   569,   153, ...,    18,  4134,   226],
       ...,
       [   38,   231,  6730, ...,   298,   125,  7303],
       [  770,     3, 21282, ...,    18,  2052,   715],
       [  522, 20865,  1012, ...,    85,   522,  1436]])

In [33]:
len(tokenizer.word_index)

38028

In [34]:
embedding_length = 200
model = Sequential()
model.add(Embedding( len(tokenizer.word_index)+1, embedding_length ,input_length = X_train_sequence.shape[1]))
model.add(LSTM(embedding_length, dropout=0.2))
model.add(Dense(11,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [35]:
model.fit(X_train_sequence, np.array(y_train), batch_size=64,epochs=25,
          validation_data=(X_test_sequence, np.array(y_test)))


Train on 1389 samples, validate on 348 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.callbacks.History at 0x1e87781ea08>

In [36]:
model.save('model_lstm.h5')

In [40]:
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=10)

In [41]:
model.fit(X_train_sequence, np.array(y_train), batch_size=64,epochs=50,
          validation_data=(X_test_sequence, np.array(y_test)), callbacks=[es])

Train on 1389 samples, validate on 348 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 00015: early stopping


<keras.callbacks.callbacks.History at 0x1e8054fe148>

In [42]:
test = ['Non-Political',
 'Politics',
 'Coronavirus',
 'AskIndia',
 'Policy/Economy',
 'Business/Finance',
 'Photography',
 '[R]eddiquette',
 'Sports',
 'Science/Technology',
 'Others']

In [44]:
test[0]

'Non-Political'