In [2]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import preprocessing
from imblearn.over_sampling import ADASYN
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib
from sklearn.utils import shuffle
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import resample
from matplotlib import pyplot as plt
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV
import numpy as np
from collections import Counter
import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/vidhi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
#getting the dataset and loading it
csv_file = pd.read_csv('/Users/vidhi/Desktop/Assesments/rsics_dataset/tagged_selections_by_sentence.csv')

In [4]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/vidhi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
#the labels and selected sentences are taken from the file and stored in dataframe
selected = csv_file['Selected']
labels = csv_file['Greeting']
df = pd.DataFrame(selected)
labels = pd.DataFrame(labels)

In [6]:
#getting the both columns
dataf = csv_file[['Selected', 'Greeting']]

In [7]:
#printing the labels total 
Counter(csv_file['Greeting'])

Counter({0: 5957, 1: 802})

In [8]:
# Separate majority and minority classes as 1 labels are less as compared to 0
data_majority = dataf[dataf['Greeting'] == 0]
data_minority = dataf[dataf['Greeting'] == 1]
bias = data_minority.shape[0]/data_majority.shape[0]
# lets split train/test data first then 
train = pd.concat([data_majority.sample(frac=0.8,random_state=200),
         data_minority.sample(frac=0.8,random_state=200)])
test = pd.concat([data_majority.drop(data_majority.sample(frac=0.8,random_state=200).index),
        data_minority.drop(data_minority.sample(frac=0.8,random_state=200).index)])

train = shuffle(train)
test = shuffle(test)

In [9]:
#separating the label from training data and perform upsamping 

data_majority = train[train['Greeting'] == 0]
data_minority = train[train['Greeting'] == 1]
# Upsample minority class
data_minority_upsampled = resample(data_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples= 2500,    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
data_upsampled = pd.concat([data_majority, data_minority_upsampled])

print("After upsampling\n",data_upsampled.Greeting.value_counts(),sep = "")

After upsampling
0    4766
1    2500
Name: Greeting, dtype: int64


In [10]:
#performing preprocessing 

def preprocessing(text):
    stemmer = WordNetLemmatizer()
    document = re.sub(r'\W', ' ', str(text))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    return document

In [11]:
#getting the upsampled training data 
training_data = np.array(data_upsampled['Selected'])
testing_data = np.array(test['Selected'])

y_train = np.array(data_upsampled['Greeting'])
y_test = np.array(test['Greeting'])

In [12]:
y_test.shape

(1351,)

In [13]:
#Processing the training and test data separately
X_train = []
X_test = []

for i in range(len(training_data)):
    X_train.append(preprocessing(training_data[i]))
    
for i in range(len(testing_data)):
    X_test.append(preprocessing(testing_data[i]))

In [14]:
#Tokenizing and making a vocab
corpus = np.array(X_train)
vectorizer = CountVectorizer(decode_error="replace")
vec_train = vectorizer.fit_transform(corpus).toarray()
#Save vectorizer.vocabulary_
pickle.dump(vectorizer.vocabulary_,open("feature.pkl","wb"))

In [15]:
#Fitting the data in multinomial naive bayes
#a specialized version of naive bayes designed to handle text documents using word counts

nb = MultinomialNB()

nb.fit(vec_train, y_train)

nb.score(vec_train, y_train)

0.9606385906963941

In [16]:
pickle.dump(nb,open("classifier.pkl","wb"))

In [18]:
#Load the vocab later from the saved model and predicting test data
transformer = TfidfTransformer()
loaded_vec = CountVectorizer(decode_error="replace",vocabulary=pickle.load(open("feature.pkl", "rb")))
Test_data = transformer.fit_transform(loaded_vec.fit_transform(X_test)).toarray()

In [19]:
#predicting the test data
y_pred = nb.predict(Test_data)

In [20]:
#classification report
#print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1191
           1       0.71      0.71      0.71       160

    accuracy                           0.93      1351
   macro avg       0.84      0.83      0.83      1351
weighted avg       0.93      0.93      0.93      1351

0.9311621021465581


In [21]:
#Predicting a new data
q1 = preprocessing('Hello, Both got canceled even though they were confirmed about three times. Had to take another airline back skipped last flight and took the train back instead. Had paid for the overseas flight extra for economy plus seats and of course didnt get them since the flight got canceled and we flew with another carrier. Thanks, Undine')
list1 = []
list1.append(q1)
#Load it later
transformer = TfidfTransformer()
loaded_vec = CountVectorizer(decode_error="replace",vocabulary=pickle.load(open("feature.pkl", "rb")))
tfidf = transformer.fit_transform(loaded_vec.fit_transform(list1)).toarray()

In [22]:
nb.predict(tfidf)

array([1])