## Loading packages

In [1]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('words')
from nltk.corpus import words
import re
import string
import datetime
from datetime import datetime
from dateutil.parser import parse

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Tanya\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## Reading training data

In [2]:
train = pd.read_csv("en_train.csv")
train.shape

(9918441, 5)

In [3]:
train

Unnamed: 0,sentence_id,token_id,class,before,after
0,0,0,PLAIN,Brillantaisia,Brillantaisia
1,0,1,PLAIN,is,is
2,0,2,PLAIN,a,a
3,0,3,PLAIN,genus,genus
4,0,4,PLAIN,of,of
...,...,...,...,...,...
9918436,748065,12,PLAIN,of,of
9918437,748065,13,PLAIN,the,the
9918438,748065,14,ORDINAL,19th,nineteenth
9918439,748065,15,PLAIN,century,century


## Removing null values

In [3]:
train[train["before"].isnull() == True]

Unnamed: 0,sentence_id,token_id,class,before,after
584464,46722,18,PLAIN,,
616107,49226,17,LETTERS,,n a
684691,54634,1,PLAIN,,
747949,59607,5,PLAIN,,
965529,76612,7,PLAIN,,
...,...,...,...,...,...
9665672,729456,2,PLAIN,,
9665677,729456,7,PLAIN,,
9725113,733797,15,PLAIN,,
9864166,744065,15,PLAIN,,


In [4]:
train[train["after"].isnull() == True]

Unnamed: 0,sentence_id,token_id,class,before,after
584464,46722,18,PLAIN,,
684691,54634,1,PLAIN,,
747949,59607,5,PLAIN,,
965529,76612,7,PLAIN,,
1347924,106058,6,PLAIN,,
...,...,...,...,...,...
9665672,729456,2,PLAIN,,
9665677,729456,7,PLAIN,,
9725113,733797,15,PLAIN,,
9864166,744065,15,PLAIN,,


In [5]:
train.dropna(inplace=True)

In [6]:
train = train[train['before'].map(lambda x: x.isascii())]
train = train[train['after'].map(lambda x: x.isascii())]
train.shape
# remove non-English rows
# https://stackoverflow.com/questions/65012603/removing-rows-contains-non-english-words-in-pandas-dataframe

(9852216, 5)

## Reading test data

In [7]:
train[train["class"] == "DATE"]

Unnamed: 0,sentence_id,token_id,class,before,after
10,1,0,DATE,2006,two thousand six
51,3,7,DATE,2007,two thousand seven
80,5,0,DATE,2008,two thousand eight
111,8,1,DATE,4 March 2014,the fourth of march twenty fourteen
147,11,1,DATE,"April 10, 2013",april tenth twenty thirteen
...,...,...,...,...,...
9918259,748054,1,DATE,October 1865,october eighteen sixty five
9918267,748054,9,DATE,14 January 1867,the fourteenth of january eighteen sixty seven
9918276,748054,18,DATE,1866,eighteen sixty six
9918311,748056,8,DATE,1291,twelve ninety one


In [8]:
test = pd.read_csv("en_test.csv")
test

Unnamed: 0,sentence_id,token_id,before
0,0,0,Another
1,0,1,religious
2,0,2,family
3,0,3,is
4,0,4,of
...,...,...,...
1088559,69999,13,close
1088560,69999,14,to
1088561,69999,15,machine
1088562,69999,16,languages


In [9]:
test[test["before"].isnull() == True]

Unnamed: 0,sentence_id,token_id,before
83566,5382,3,
149180,9602,1,
261261,16795,8,
261266,16795,13,
310898,19971,8,
378899,24332,10,
421731,27088,2,
478343,30770,4,
652551,42023,11,
751492,48355,1,


In [10]:
test.dropna(inplace=True)

In [11]:
test["class"] = ""
test["after"] = ""

In [12]:
test=test.reindex(columns= ['sentence_id', 'token_id', 'class','before', 'after'])
test.head()
test.shape

(1088551, 5)

In [13]:
test = test[test['before'].map(lambda x: x.isascii())]
test.shape
# remove non-English rows
# https://stackoverflow.com/questions/65012603/removing-rows-contains-non-english-words-in-pandas-dataframe

(1081965, 5)

## Create Naive Bayes to classify `class`

In [14]:
X_train, X_test, y_train, y_test = train_test_split(train["before"], train["class"], test_size=0.3)

In [15]:
v = CountVectorizer(analyzer="char")
X_train_count = v.fit_transform(X_train.values)
X_train_count = X_train_count.toarray()
X_train_count[:1]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [16]:
model = MultinomialNB()
model.fit(X_train_count, y_train.values.ravel())

MultinomialNB()

In [17]:
X_test_count = v.transform(X_test.values)
X_test_count = X_test_count.toarray()
y_pred = model.predict(X_test_count)
print(accuracy_score(y_test, y_pred))

0.967594433063287


In [19]:
# from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors=3)
# knn.fit(X_train_count, y_train)
# predictions = knn.predict(X_test_count)
# print(accuracy_score(y_test, predictions))

In [20]:
# from sklearn.tree import DecisionTreeClassifier
# dtree = DecisionTreeClassifier()
# dtree.fit(X_train_count, y_train)
# predictions = dtree.predict(X_test_count)
# print(accuracy_score(y_test, predictions))

In [21]:
# from sklearn.ensemble import RandomForestClassifier
# rfc = RandomForestClassifier(n_estimators=10)
# rfc.fit(X_train_count, y_train.values.ravel())
# predictions = rfc.predict(X_test_count)
# print(accuracy_score(y_test, predictions))

In [22]:
# # selecting columns for submission file
# test=test.reindex(columns= ['id', 'before', 'after'])
# test=test.iloc[:,:3]
# test.head()

## Use Neural Networks to classify `class`

In [20]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [21]:
import numpy as np
labelencoder_y_train = LabelEncoder()
y_train = labelencoder_y_train.fit_transform(y_train)
y_train = y_train.reshape(-1, 1)
print(np.max(y_train))

labelencoder_y_test = LabelEncoder()
y_test = labelencoder_y_test.fit_transform(y_test)
y_test = y_test.reshape(-1, 1)

print(y_train)
print(y_test)

15
[[11]
 [11]
 [11]
 ...
 [11]
 [11]
 [11]]
[[11]
 [10]
 [12]
 ...
 [11]
 [11]
 [12]]


In [22]:
onehotencoder = OneHotEncoder(categories="auto")
y_train = onehotencoder.fit_transform(y_train).toarray()
y_test = onehotencoder.fit_transform(y_test).toarray()

In [26]:
print(y_train.shape)

(6896551, 16)


In [23]:
from keras.models import Sequential
from keras.layers import Dense

In [24]:
nn = Sequential()
nn.add(Dense(60, activation='relu')) # 10 features
nn.add(Dense(40, activation='relu')) # between
nn.add(Dense(16, activation='softmax'))

nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [25]:
nn.fit(X_train_count, y_train)



<tensorflow.python.keras.callbacks.History at 0x19c906f8bb0>

In [26]:
# y_pred here refers to the 30% of training data
y_pred = nn.predict(X_test_count)
print(y_pred.shape)
y_pred = y_pred.argmax(axis=1)

print(y_pred.shape)
print(y_test.shape)
print(y_test)

(2955665, 16)
(2955665,)
(2955665, 16)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [27]:
y_test = y_test.argmax(axis=1)
print(accuracy_score(y_pred, y_test))

0.9919679665997331


In [33]:
print(confusion_matrix(y_pred, y_test))

[[     72       1       3       0       0       0       0       0       1
        0       0       0       0       0       0       0]
 [      9   38622     144      56    1325       0       6      39       5
        1     260      31       0     111       0       7]
 [     18    1325   77185       1     298       3      12       1      28
       26       0       0       0     157       4       5]
 [      0       0       1    2924       0       1       0       0       1
        5       0       0       0       0       3       0]
 [      0       0       0       0       0       0       0       0       0
        0       0       0       0       4       0       0]
 [      0       0       4       0       0    1486       0       2       6
        1       0       5       0       0       3       8]
 [      0       0      14       0       0       0     306       0       0
        0       0       0       0       0       0       0]
 [      0      67       3       0       0      18       0   34409    

In [28]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.77      0.77       156
           1       0.95      0.96      0.96     40188
           2       0.98      1.00      0.99     77449
           3       0.99      0.98      0.98      2892
           4       0.00      0.00      0.00      1632
           5       0.98      0.96      0.97      1486
           6       0.90      1.00      0.95       311
           7       0.89      0.75      0.81     45471
           8       0.98      0.98      0.98      4099
           9       0.99      0.97      0.98      1590
          10       0.99      0.87      0.92      3852
          11       0.99      1.00      1.00   2204128
          12       1.00      1.00      1.00    561073
          13       0.96      0.80      0.87      1155
          14       0.99      0.96      0.97       448
          15       0.71      0.89      0.79      9735

    accuracy                           0.99   2955665
   macro avg       0.88   

In [29]:
X_test_count = v.transform(test["before"])
y_pred = nn.predict(X_test_count)
y_pred = y_pred.argmax(axis=1)
print(y_pred.shape)
print(y_test.shape)
print(y_test)

(1081965,)
(2955665,)
[11 10 12 ... 11 11 12]


In [30]:
print(labelencoder_y_train.classes_)
print(labelencoder_y_test.classes_)

['ADDRESS' 'CARDINAL' 'DATE' 'DECIMAL' 'DIGIT' 'ELECTRONIC' 'FRACTION'
 'LETTERS' 'MEASURE' 'MONEY' 'ORDINAL' 'PLAIN' 'PUNCT' 'TELEPHONE' 'TIME'
 'VERBATIM']
['ADDRESS' 'CARDINAL' 'DATE' 'DECIMAL' 'DIGIT' 'ELECTRONIC' 'FRACTION'
 'LETTERS' 'MEASURE' 'MONEY' 'ORDINAL' 'PLAIN' 'PUNCT' 'TELEPHONE' 'TIME'
 'VERBATIM']


In [47]:
y_pred = [labelencoder_y_train.classes_[y] for y in y_pred]
y_pred

['PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PUNCT',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PUNCT',
 'PLAIN',
 'PUNCT',
 'PLAIN',
 'PUNCT',
 'CARDINAL',
 'PLAIN',
 'PUNCT',
 'PLAIN',
 'PUNCT',
 'CARDINAL',
 'PLAIN',
 'PUNCT',
 'PLAIN',
 'PUNCT',
 'CARDINAL',
 'PLAIN',
 'PUNCT',
 'VERBATIM',
 'PUNCT',
 'PUNCT',
 'DATE',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PUNCT',
 'MEASURE',
 'PUNCT',
 'VERBATIM',
 'PLAIN',
 'CARDINAL',
 'PLAIN',
 'PLAIN',
 'PUNCT',
 'PLAIN',
 'PUNCT',
 'PUNCT',
 'PUNCT',
 'PLAIN',
 'CARDINAL',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'LETTERS',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PUNCT',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'CARDINAL',
 'PLAIN',
 'PLAIN',
 'PUNCT',
 'MEASURE',
 'PUNCT',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'PLAIN',
 'CARDINAL',
 'PLAIN',
 'PUNCT',
 'PUNCT',
 'PLAIN',
 'PLAIN'

In [31]:
test["class"] = y_pred
test

Unnamed: 0,sentence_id,token_id,class,before,after
0,0,0,11,Another,
1,0,1,11,religious,
2,0,2,11,family,
3,0,3,11,is,
4,0,4,11,of,
...,...,...,...,...,...
1088559,69999,13,11,close,
1088560,69999,14,11,to,
1088561,69999,15,11,machine,
1088562,69999,16,11,languages,


In [32]:
test.to_csv("en_test2.csv", index=False)
train.to_csv("en_train2.csv", index=False)