In [None]:
# Importing Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Suppress every warning category so cell outputs are not cluttered.
warnings.filterwarnings(action='ignore')

# Remove pandas' display truncation: None means "no limit" for both axes.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

from pylab import rcParams
# Default figure size as a (width, height) pair.
rcParams['figure.figsize'] = (10, 5)

In [None]:
# Importing deep-learning (DL) and NLP libraries
import nltk
import keras
import re

from wordcloud import WordCloud

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,Flatten,GlobalAveragePooling1D

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

In [None]:
# Object initialization
# stopwords.words() raises LookupError when the NLTK "stopwords" corpus is not
# installed locally; fetch it once and retry so the notebook runs from a fresh
# environment.
try:
    eng_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    eng_words = set(stopwords.words('english'))

# Stemmer used by clean_text; the lemmatizer is created here but not used by
# any cell visible in this notebook.
p_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
# Read the train and test CSV files from the current working directory
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Data concat
# Stack the train rows on top of the test rows so both go through the same
# preprocessing. Original row labels are kept, so index labels repeat; later
# cells slice this frame by position.
# `axis` is passed by keyword: pandas deprecated positional arguments after
# `objs` and pandas 2.x rejects them with a TypeError.
data = pd.concat([train_data, test_data], axis=0, sort=False)

In [None]:
# Preview the first five rows of the combined frame
data.head(n=5)

In [None]:
# Pull the id and target columns out of the combined frame for later cells
data_id, Y = data['id'], data['target']

In [None]:
# Clean data function
def clean_text(corpus):
    """Normalise raw documents into space-joined, stemmed tokens.

    For each document, every character outside ``a-z``/``A-Z`` is replaced by
    a space, the text is lower-cased and split on whitespace, tokens found in
    the module-level ``eng_words`` set are dropped, and the remaining tokens
    are stemmed with the module-level ``p_stemmer``.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents. Items are iterated directly rather than looked up by
        position, so lists, NumPy arrays, pandas Series with any index and
        generators are all accepted.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order. An empty
        input yields an empty list.

    Raises
    ------
    TypeError
        If an item is not a string (``re`` cannot process it).
    """
    non_letters = re.compile('[^a-zA-Z]')  # compiled once, reused per document
    clean_corpus = []
    for document in corpus:
        tokens = non_letters.sub(' ', document).lower().split()
        stems = [p_stemmer.stem(token) for token in tokens if token not in eng_words]
        clean_corpus.append(' '.join(stems))
    return clean_corpus

In [None]:
# Clean the text column of the combined frame (train rows followed by test rows)
raw_texts = data['text'].values
corpus = clean_text(raw_texts)

In [None]:
# Word cloud
# Join every cleaned document into one string, build the cloud from it,
# then render the cloud as an image.
sent_of_string = ' '.join(corpus)
word_cloud = WordCloud().generate(sent_of_string)
plt.imshow(word_cloud)

In [None]:
# Using Naive Bayes

# Count Vectorizer
# Keep the SciPy sparse matrix returned by fit_transform: MultinomialNB and
# train_test_split both accept sparse input, and densifying a
# documents-by-vocabulary matrix only costs memory.
cv = CountVectorizer()
corpus_cv = cv.fit_transform(corpus)

# Train | Validation_data
# `data` was built as train rows followed by test rows, so the first
# len(train_data) rows of the matrix are the training rows.
n_train = len(train_data)
train_x = corpus_cv[:n_train]
train_y = Y.iloc[:n_train]
validation = corpus_cv[n_train:]

# Dividing train data into train and test (80/20, fixed seed for repeatability)
xtrain,xtest,ytrain,ytest = train_test_split(train_x,train_y,test_size=0.2,random_state=0)

# Model building - Naive Bayes
nb = MultinomialNB()
nb.fit(xtrain,ytrain)
pred_nb = nb.predict(xtest)

# Desired results: accuracy and F1 on the held-out 20%
print(accuracy_score(ytest,pred_nb))
print(f1_score(ytest,pred_nb))

In [None]:
# Using Random Forest

# Count Vectorizer
# Keep the SciPy sparse matrix returned by fit_transform: RandomForestClassifier
# and train_test_split both accept sparse input, so no dense copy is needed.
cv = CountVectorizer()
corpus_cv = cv.fit_transform(corpus)

# Train | Validation_data
# `data` was built as train rows followed by test rows, so the first
# len(train_data) rows of the matrix are the training rows.
n_train = len(train_data)
train_x = corpus_cv[:n_train]
train_y = Y.iloc[:n_train]
validation = corpus_cv[n_train:]

# Dividing train data into train and test (80/20, fixed seed for repeatability)
xtrain,xtest,ytrain,ytest = train_test_split(train_x,train_y,test_size=0.2,random_state=0)

# Model building - Random Forest
rf = RandomForestClassifier(random_state=0)
rf.fit(xtrain,ytrain)
pred_rf = rf.predict(xtest)

# Desired results: accuracy and F1 on the held-out 20%
print(accuracy_score(ytest,pred_rf))
print(f1_score(ytest,pred_rf))

In [None]:
# Tf-Idf Vectorizer
# Keep the SciPy sparse matrix returned by fit_transform: MultinomialNB and
# train_test_split both accept sparse input.
tf = TfidfVectorizer()
corpus_tf = tf.fit_transform(corpus)

# Train | Validation_data
# `data` was built as train rows followed by test rows, so the first
# len(train_data) rows of the matrix are the training rows.
n_train = len(train_data)
train_x = corpus_tf[:n_train]
train_y = Y.iloc[:n_train]
# `nb` is refit below on TF-IDF features and the final submission cell calls
# nb.predict(validation), so `validation` must come from the same TF-IDF
# matrix rather than from the count-vectorized one left by earlier cells.
validation = corpus_tf[n_train:]

# Dividing train data into train and test (80/20, fixed seed for repeatability)
xtrain,xtest,ytrain,ytest = train_test_split(train_x,train_y,test_size=0.2,random_state=0)

# Model building - Naive Bayes on TF-IDF features
nb = MultinomialNB()
nb.fit(xtrain,ytrain)
pred_nb = nb.predict(xtest)

# Desired results: accuracy and F1 on the held-out 20%
print(accuracy_score(ytest,pred_nb))
print(f1_score(ytest,pred_nb))

In [None]:
# Word Embedding and LSTM
# Map every word of the cleaned corpus to an integer id, then pad all
# sequences to the length of the longest one (pad_sequences default).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

sequence = tokenizer.texts_to_sequences(corpus)
padded_sequence = pad_sequences(sequence)

# `data` was built as train rows followed by test rows, so the first
# len(train_data) rows are the training rows. train_y is recomputed here so
# this cell does not depend on a value left over from an earlier cell.
n_train = len(train_data)
train_x = padded_sequence[:n_train]
train_y = Y.iloc[:n_train]

# Dividing train data into train and test (80/20, fixed seed for repeatability)
xtrain,xtest,ytrain,ytest = train_test_split(train_x,train_y,test_size=0.2,random_state=0)

# Size the embedding from the data instead of hard-coding it:
# - word ids start at 1 (0 is reserved for padding), hence the +1;
# - the sequence length is whatever pad_sequences actually produced.
vocab_size = len(tokenizer.word_index) + 1
max_len = padded_sequence.shape[1]

model = Sequential()
model.add(Embedding(vocab_size,32,input_length=max_len))
model.add(LSTM(30,activation='relu',dropout=0.1))
# Single sigmoid unit -> probability of the positive class
model.add(Dense(units=1,activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
# Fit the network; labels are converted to Python ints element-wise first.
history = model.fit(
    xtrain,
    ytrain.map(int),
    batch_size=32,
    epochs=30,
)

In [None]:
# Training accuracy per epoch, as recorded in the fit history
accuracy_per_epoch = history.history['accuracy']
plt.plot(accuracy_per_epoch)

In [None]:
# Threshold the sigmoid outputs at 0.5 to get boolean class predictions
pred_lstm = model.predict(xtest) > 0.5

# Desired results: accuracy first, then F1, on the held-out split
for metric in (accuracy_score, f1_score):
    print(metric(ytest, pred_lstm))

In [None]:
# Final data submission
# Ids of the rows after the first 7613 (the out-of-sample rows)
out_of_sample_ids = data_id[7613:]
# Predict with the current `nb` model on the current `validation` matrix
predictions = nb.predict(validation).astype(int)

submission = pd.DataFrame(
    {
        'id': out_of_sample_ids,
        'target': predictions,
    }
)
# Write without the index column
submission.to_csv('Final_Submission.csv', index=False)