In [1]:
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from string import digits
from nltk.corpus import stopwords
import string
import nltk
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Preprocessing the data

In [2]:
"""
Not removing digits as date-time may also lead to a pattern as per problem statement
By analysing the data, Key words like "Please" "Could" should not be removed as they are also leading to a pattern.
So here I am just removing stopwords which has length <=3
"""
lemmatizer = WordNetLemmatizer()
def preProcess(sentence):
    stop = set(stopwords.words('english'))
    remove_punctuation = str.maketrans('','',string.punctuation)
#     remove_digits = str.maketrans('','',digits)
    sentence = sentence.translate(remove_punctuation)
#     sentence = sentence.translate(remove_digits).translate(remove_punctuation)
    tokens = [i for i in sentence.lower().split() if (i not in stop or len(i) > 3)]
    s=""
    for token in tokens:
        token = lemmatizer.lemmatize(token)
        s = s + ' ' + token
    return s;

# Loading the data from train and test files

In [3]:
df = pd.read_csv("/home/varun/Downloads/train.csv")

In [4]:
df["NewText"] = ""
for i in range(df.shape[0]):
    df["NewText"][i] = preProcess(df["Text"][i])
df['Tag'] = df['Label'].eq('Yes').astype(int)    

In [5]:
df.head()

Unnamed: 0,Label,Text,NewText,Tag
0,No,>>> [1]Contact Me Now to Make $100 Today!$LINK,1contact make 100 todaylink,0
1,No,Act now to keep your life on the go!,act keep your life go,0
2,No,Choose between $500 and $10000 dollars with up...,choose between 500 10000 dollar with 5 year r...,0
3,No,Click above to earn today.,click above earn today,0
4,No,Click here to receive your first $10 today:,click here receive your first 10 today,0


In [6]:
df1 = pd.read_csv("/home/varun/Downloads/test.csv")
df1["NewText"] = ""
for i in range(df1.shape[0]):
    df1["NewText"][i] = preProcess(df1["Text"][i])
df1['Tag'] = df1['Label'].eq('Yes').astype(int) 

# Combining train and test data to keep the vocab same for TFIDF 

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)

In [8]:
list1  = df["NewText"].tolist()
list2  = df1["NewText"].tolist() 

In [9]:
for i in range(len(list2)):
    list1.append(list2[i])

In [10]:
X = vectorizer.fit_transform(list1)
print(X.shape)

(4649, 7190)


## Splitting the data as per train and test file

In [11]:
"""
As we appended test data for vectorizing, 
we put last 992 files in the X_test and first 3657 files in the X_train
"""

X_train = X[:3657]
X_test = X[3657:]
y_train = df['Tag'].tolist()
y_test = df1['Tag'].tolist()
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
y_train = np.reshape(y_train,(3657,1))
y_test = np.reshape(y_test,(992,1))

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3657, 7190)
(992, 7190)
(3657, 1)
(992, 1)


# KNN Classifier
## accuracy = 65.9%

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [14]:
acc_list = []

In [None]:
for i in range(300):
    if(i > 1):
#         print(X_train.shape)
#         print(X_test.shape)
#         print(y_train.shape)
#         print(y_test.shape)
        knn = KNeighborsClassifier(n_neighbors= i)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        acc_list.append(accuracy_score(y_test,y_pred))

In [16]:
max(acc_list)

0.6592741935483871

In [17]:
confusion_matrix(y_test, y_pred)

array([[350, 333],
       [ 42, 267]])

In [18]:
print("Recall : ", precision_score(y_test, y_pred))
print("Precision : ", recall_score(y_test, y_pred))

Recall :  0.445
Precision :  0.8640776699029126


# Naive Bayes Classifier 
## accuracy = 51.5%

In [19]:
X_train = X_train.toarray()
X_test = X_test.toarray()

In [20]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
accuracy_score(y_test,y_pred)

  y = column_or_1d(y, warn=True)


0.5151209677419355

In [21]:
confusion_matrix(y_test, y_pred)

array([[280, 403],
       [ 78, 231]])

In [22]:
print("Recall : ", precision_score(y_test, y_pred))
print("Precision : ", recall_score(y_test, y_pred))

Recall :  0.3643533123028391
Precision :  0.7475728155339806


# SVM Classifier
## accuracy = 77.6%

In [23]:
X_train = X[:3657]
X_test = X[3657:]
y_train = df['Tag'].tolist()
y_test = df1['Tag'].tolist()
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
y_train = np.reshape(y_train,(3657,1))
y_test = np.reshape(y_test,(992,1))

In [24]:
from sklearn.svm import SVC
clf = SVC(C=2300, kernel='rbf')
clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=2300, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [25]:
y_pred = clf.predict(X_test)

In [26]:
accuracy_score(y_test,y_pred)

0.7762096774193549

In [27]:
confusion_matrix(y_test, y_pred)

array([[557, 126],
       [ 96, 213]])

In [28]:
print("Recall : ", precision_score(y_test, y_pred))
print("Precision : ", recall_score(y_test, y_pred))

Recall :  0.6283185840707964
Precision :  0.6893203883495146


# Decesion Tree
## accuracy = 64.4%

In [29]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [30]:
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.657258064516129

In [31]:
confusion_matrix(y_test, y_pred)

array([[433, 250],
       [ 90, 219]])

In [32]:
print("Recall : ", precision_score(y_test, y_pred))
print("Precision : ", recall_score(y_test, y_pred))

Recall :  0.4669509594882729
Precision :  0.7087378640776699


# XGBoost Classifier
# accuracy  = 80.5%

In [33]:
from xgboost import XGBClassifier

In [34]:
model = XGBClassifier()
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [35]:
y_pred = model.predict(X_test)
accuracy_score(y_test,y_pred)

0.8054435483870968

In [36]:
confusion_matrix(y_test, y_pred)

array([[604,  79],
       [114, 195]])

In [37]:
print("Recall : ", precision_score(y_test, y_pred))
print("Precesion : ", recall_score(y_test, y_pred))

Recall :  0.7116788321167883
Precesion :  0.6310679611650486
