In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
phrases = ["The quick brown fox jumped over the lagy dog","education is what you have left over after forgetting everything you have before"]

In [3]:
vect = CountVectorizer()
vect.fit(phrases)

CountVectorizer()

In [4]:
print("Vocabulary Size:{}".format(len(vect.vocabulary_)))
print("Vocabulary Content:\n {}".format(vect.vocabulary_))

Vocabulary Size:18
Vocabulary Content:
 {'the': 15, 'quick': 14, 'brown': 2, 'fox': 7, 'jumped': 10, 'over': 13, 'lagy': 11, 'dog': 3, 'education': 4, 'is': 9, 'what': 16, 'you': 17, 'have': 8, 'left': 12, 'after': 0, 'forgetting': 6, 'everything': 5, 'before': 1}


In [5]:
bag_of_words = vect.transform(phrases)
print(bag_of_words)

  (0, 2)	1
  (0, 3)	1
  (0, 7)	1
  (0, 10)	1
  (0, 11)	1
  (0, 13)	1
  (0, 14)	1
  (0, 15)	2
  (1, 0)	1
  (1, 1)	1
  (1, 4)	1
  (1, 5)	1
  (1, 6)	1
  (1, 8)	2
  (1, 9)	1
  (1, 12)	1
  (1, 13)	1
  (1, 16)	1
  (1, 17)	2


In [6]:
print("bag_of_words as an array:\n{}".format(bag_of_words.toarray()))

bag_of_words as an array:
[[0 0 1 1 0 0 0 1 0 0 1 1 0 1 1 2 0 0]
 [1 1 0 0 1 1 1 0 2 1 0 0 1 1 0 0 1 2]]


In [7]:
vect.get_feature_names()



['after',
 'before',
 'brown',
 'dog',
 'education',
 'everything',
 'forgetting',
 'fox',
 'have',
 'is',
 'jumped',
 'lagy',
 'left',
 'over',
 'quick',
 'the',
 'what',
 'you']

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
data = pd.read_csv("/content/drive/MyDrive/Thesis_Coding/Dataset1_after_augmentation.csv")

In [11]:
data.head()

Unnamed: 0,Label,Text
0,1,nen á vist bolest vztek smutek ज़मातेक ोसम ě lo...
1,1,हाँ यार नेहा कब करेगा वह पोस्ट उसने न सच में p...
2,0,television media congress के लिए नही ह . ये तो...
3,2,आल इंडिया me ंर्क लागु करे w कश्मीर से dhara 3...
4,1,who पागल है क्या ? They aren ’ t real issues M...


In [12]:
print("Samples per class: {}".format(np.bincount(data.Label)))

Samples per class: [11863 15110 13026]


In [13]:
def simple_split(data, y, length, split_mark=0.7):
    if split_mark > 0. and split_mark < 1.0:
        n = int(split_mark*length)
    else:
        n = int(split_mark)
    X_train = data[:n].copy()
    X_test = data[n:].copy()
    y_train =y[:n].copy()
    y_test = y[n:].copy()
    return X_train, X_test, y_train, y_test

In [14]:
vectorizer = CountVectorizer()

In [15]:
X_train, X_test, y_train, y_test = simple_split(data.Text, data.Label, len(data))
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(27999,) (12000,) (27999,) (12000,)


In [16]:
print("Samples per class: {}".format(np.bincount(y_train)))
print("Samples per class: {}".format(np.bincount(y_test)))

Samples per class: [ 8258 10682  9059]
Samples per class: [3605 4428 3967]


In [17]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [18]:
feature_names = vectorizer.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
print("Features 19500 to 19530:\n{}".format(feature_names[17000:17730]))
print("Every 2000th feature:\n{}".format(feature_names[::2000]))

Number of features: 17734
First 20 features:
['0007', '000per', '001', '003', '004', '007', '008', '009', '01', '0103', '017', '02', '0281', '03', '03013461454', '0336', '035539', '05', '06', '0600am']
Features 19500 to 19530:
['लअनत', 'लईए', 'लए', 'लएब', 'लओग', 'लक', 'लकक', 'लकण', 'लकत', 'लकन', 'लकम', 'लकर', 'लकव', 'लख', 'लखत', 'लखन', 'लखनऊ', 'लखनप', 'लग', 'लगई', 'लगक', 'लगगए', 'लगत', 'लगन', 'लगभग', 'लगय', 'लगव', 'लघ', 'लघन', 'लच', 'लचए', 'लचर', 'लज', 'लझ', 'लट', 'लटक', 'लठ', 'लड', 'लडक', 'लढ', 'लण', 'लत', 'लतख', 'लतमक', 'लद', 'लदन', 'लदय', 'लदव', 'लध', 'लन', 'लप', 'लपट', 'लपनस', 'लपल', 'लफ', 'लफल', 'लफड़', 'लब', 'लबदन', 'लबर', 'लभ', 'लम', 'लमत', 'लमस', 'लय', 'लयव', 'लर', 'लरए', 'लरक', 'लरकय', 'लरत', 'लरन', 'लरह', 'लल', 'ललक', 'ललथ', 'ललयद', 'लव', 'लवक', 'लवकत', 'लवद', 'लवर', 'लवल', 'लवज़', 'लश', 'लशन', 'लष', 'लस', 'लसप', 'लसर', 'लसससससस', 'लह', 'लहज', 'लहत', 'लहर', 'लहरब', 'लहक़', 'लक़', 'लक़ब', 'लड़', 'लड़क', 'लड़कर', 'लड़त', 'लड़न', 'लड़ल', 'लफ़', 'लफ़ज़', 'ल०', 'ल२', 'ळक', 'वअव', 



In [19]:
vectorizer.vocabulary_

{'nen': 9181,
 'vist': 13960,
 'bolest': 2274,
 'vztek': 14025,
 'smutek': 12251,
 'ज़म': 17589,
 'सम': 17325,
 'lost': 8036,
 'beznad': 2014,
 'nakonec': 9049,
 'कल': 15440,
 'अस': 14772,
 'तखल': 15943,
 'पद': 16312,
 'life': 7873,
 'कब': 15383,
 'कर': 15406,
 'वह': 17173,
 'उसन': 15138,
 'सच': 17267,
 'photoshoot': 10009,
 'करन': 15420,
 'television': 13057,
 'media': 8495,
 'congress': 3283,
 'नह': 16240,
 'आपक': 14878,
 'पत': 16303,
 'चल': 15661,
 'गय': 15559,
 'अच': 14652,
 'ke': 7229,
 'आल': 14915,
 'me': 8478,
 'कश': 15455,
 'dhara': 3907,
 '370ko': 378,
 'ख़तम': 17562,
 'हम': 17465,
 'यन': 16836,
 'आपस': 14884,
 'यह': 16871,
 'उम': 15125,
 'who': 14248,
 'गल': 15572,
 'they': 13166,
 'aren': 1330,
 'real': 10865,
 'issues': 6756,
 'mandir': 8280,
 'is': 6709,
 'important': 6449,
 'hindu': 6083,
 'खतर': 15487,
 'मन': 16728,
 'आश': 14924,
 'करत': 15414,
 'तरह': 16003,
 'जनत': 15736,
 'jant': 6858,
 'but': 2536,
 'तउ': 15933,
 'new': 9204,
 'job': 6971,
 'गई': 15515,
 'ye': 14450,

In [20]:
i = 10500
j = 10
words = vectorizer.get_feature_names()[i:i+10]
pd.DataFrame(X_train[j:j+7, i:i+10].todense(), columns=words)

Unnamed: 0,protection,protective,protein,proterm,protest,protestant,prothom,protocol,prou,proud
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0


In [21]:
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Mean cross-validation accuracy: 0.80


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [22]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score:{:.3f}".format(logreg.score(X_test, y_test)))

Training set score: 0.910
Test set score:0.854


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [23]:
pred_logreg = logreg.predict(X_test)
confusion = confusion_matrix(y_test, pred_logreg)
print("Confudsion Matrix:\n{}".format(confusion))

Confudsion Matrix:
[[3112  380  113]
 [ 320 3756  352]
 [ 170  413 3384]]


In [24]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
print("Training set score: {:.3f}".format(nb.score(X_train, y_train)))
print("Test set score: {:.3f}".format(nb.score(X_test, y_test)))


Training set score: 0.761
Test set score: 0.713


In [25]:
pred_nb = nb.predict(X_test)
confusion = confusion_matrix(y_test, pred_nb)
print("Confusion Matrix:\n{}".format(confusion))

Confusion Matrix:
[[3077  313  215]
 [1118 2507  803]
 [ 586  413 2968]]


In [26]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print("Training set score: {:.3f}".format(rf.score(X_train, y_train)))
print("Test set score: {:.3f}".format(rf.score(X_test, y_test)))

Training set score: 0.994
Test set score: 0.991
