In [56]:
# import packages
import numpy as np
import pandas as pd
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score ,confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC



### Load the dataset

- Load the training data and explore the statistical properties of the dataset.

In [57]:
# Code starts here
import os

# Folder that holds the competition CSV files. The default is the folder the
# notebook was originally written against; set the NLP_DATA_DIR environment
# variable to point somewhere else instead of editing this cell.
DATA_DIR = os.environ.get('NLP_DATA_DIR', r'C:\Users\Vinit.Shetty\Documents\DS\UCL- NLP1')

train_data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
# Code ends here

### Visualize and Preprocess the data

- Retaining only alphabetic characters (using regular expressions)
- Removing stopwords (Using nltk library)

In [58]:
# Code starts here

stop = set(stopwords.words('english'))


def preprocess(title):
    """Normalise one headline before it is vectorised.

    Every non-alphabetic character is replaced by a space, the text is
    lower-cased and split on whitespace, tokens found in the English
    stopword set `stop` are dropped, and the remaining tokens are joined
    back together with single spaces.

    Parameters
    ----------
    title : str
        Raw headline text. Any non-string value (for example the NaN that
        pandas produces for an empty CSV field) is treated as an empty
        title instead of raising inside `re.sub`.

    Returns
    -------
    str
        The cleaned, space-separated headline (possibly empty).
    """
    if not isinstance(title, str):
        return ''
    tokens = re.sub("[^a-zA-Z]", " ", title).lower().split()
    return ' '.join(token for token in tokens if token not in stop)


# clean every title in a single pass over the column
train_data['TITLE'] = train_data['TITLE'].apply(preprocess)

# split into training and test sets (fixed seed so the split is reproducible)
X_train, X_test, Y_train, Y_test = train_test_split(train_data["TITLE"], train_data["CATEGORY"],
                                                    test_size=0.2, random_state=3)

# Code ends here

### Model building

- Now let's come to the actual task: using any classifier, predict the `CATEGORY`. Use the different techniques you have learned to improve the performance of the model.
- Try improving upon the `accuracy_score` ([Accuracy Score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html))

In [59]:
# Bag-of-words features: learn the vocabulary on the training titles only,
# then reuse that vocabulary for the held-out titles.
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# TF-IDF features over unigrams, bigrams and trigrams, fitted the same way.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [60]:
# One Multinomial Naive Bayes model per feature representation.
nb_1 = MultinomialNB()
nb_2 = MultinomialNB()

# fit on count vectorizer training data
nb_1.fit(X_train_count, Y_train)

# fit on tfidf vectorizer training data
nb_2.fit(X_train_tfidf, Y_train)

# accuracy_score is documented as accuracy_score(y_true, y_pred); pass the
# held-out labels first. The score is just the fraction of matching entries,
# so the printed numbers are the same as with the arguments swapped.

# accuracy with count vectorizer
acc_count_nb = accuracy_score(Y_test, nb_1.predict(X_test_count))

# accuracy with tfidf vectorizer
acc_tfidf_nb = accuracy_score(Y_test, nb_2.predict(X_test_tfidf))

# display accuracies (count features first, then TF-IDF)
print(acc_count_nb, acc_tfidf_nb)

0.9272641188394218 0.9295870507642002


In [61]:
import warnings

# initialize logistic regression (one-vs-rest), one model per feature representation
logreg_1 = OneVsRestClassifier(LogisticRegression(random_state=10))
logreg_2 = OneVsRestClassifier(LogisticRegression(random_state=10))

# Silence warnings only while the models are being fitted. A bare
# warnings.filterwarnings('ignore') would stay active for the rest of the
# session and hide unrelated warnings raised by later cells.
# NOTE(review): presumably the noise comes from the LogisticRegression solver
# (defaults / convergence) - confirm, and consider fixing the cause instead.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # fit on count vectorizer training data
    logreg_1.fit(X_train_count, Y_train)

    # fit on tfidf vectorizer training data
    logreg_2.fit(X_train_tfidf, Y_train)

# accuracy_score is documented as accuracy_score(y_true, y_pred); pass the
# held-out labels first. The resulting numbers are unchanged.

# accuracy with count vectorizer
acc_count_logreg = accuracy_score(Y_test, logreg_1.predict(X_test_count))

# accuracy with tfidf vectorizer
acc_tfidf_logreg = accuracy_score(Y_test, logreg_2.predict(X_test_tfidf))

# display accuracies (count features first, then TF-IDF)
print(acc_count_logreg, acc_tfidf_logreg)

0.9462026721115008 0.9420746593279773


### Prediction on the test data and creating the sample submission file.

- Load the test data and store the `Id` column in a separate variable.
- Perform the same operations on the test data that you have performed on the train data.
- Create the submission file as a `csv` file consisting of the `Id` column from the test data and your prediction as the second column.

In [64]:
# Code Starts here
# Prediction on test data
import os

# Folder that holds the competition CSV files. The default is the folder the
# notebook was originally written against; set the NLP_DATA_DIR environment
# variable to point somewhere else instead of editing this cell.
DATA_DIR = os.environ.get('NLP_DATA_DIR', r'C:\Users\Vinit.Shetty\Documents\DS\UCL- NLP1')

test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
Id = test['Id']


def _clean_test_title(title):
    """Apply the training-time title cleaning to one test headline.

    Non-alphabetic characters become spaces, the text is lower-cased and
    split on whitespace, tokens in the stopword set `stop` are removed and
    the rest are joined with single spaces. Non-string values (e.g. NaN from
    an empty CSV field) are returned as an empty string instead of crashing
    inside `re.sub`.
    """
    if not isinstance(title, str):
        return ''
    tokens = re.sub("[^a-zA-Z]", " ", title).lower().split()
    return ' '.join(token for token in tokens if token not in stop)


test['TITLE'] = test['TITLE'].apply(_clean_test_title)

# Reuse the vectorizers fitted on the training titles (transform only, no refit).
test_count = count_vectorizer.transform(test['TITLE'])
# TF-IDF features are computed as well, but the submission below only uses
# the count-based features.
test_tfidf = tfidf_vectorizer.transform(test['TITLE'])

# Predict with the logistic regression trained on count features.
test_prediction = logreg_1.predict(test_count)
submission = pd.DataFrame({'Id': Id, 'CATEGORY': test_prediction})
filename = 'submission.csv'
submission.to_csv(filename, index=False)
print('Saved file: ' + filename)
# Code ends here

Saved file: submission.csv
