In [None]:
# NLP analysis

In [None]:
import datetime, time, os
import platform

import psutil
import numpy as np
import pandas as pd
import nltk
try:
    from sklearn.feature_extraction import stop_words
except ImportError:
    # Newer scikit-learn releases only expose this module under its private name.
    from sklearn.feature_extraction import _stop_words as stop_words
#from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

from sklearn.naive_bayes import GaussianNB,MultinomialNB,ComplementNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Announce the run on stdout with the current wall-clock time.
print("{}   start of baseline model".format(datetime.datetime.now()))

# Control parameters
size = 10            # subsample divisor: only the first 1/size of each dataset is kept
tfTransform = True   # True -> TF-IDF features feed the models, False -> raw counts

# The log file name embeds the subsample divisor and the current Unix timestamp.
filename = "GB_SIZE_FEATURE_" + "_".join([str(size), str(int(time.time()))]) + ".txt"
print(filename)

# The handle is left open on purpose: later cells keep writing to `f`,
# and the final cell of the notebook closes it.
f = open(filename, "w")

# Header: a snapshot of the machine the run executes on.
f.write('environment \n')
f.write("{}\n".format(platform.uname()))
f.write("{}\n".format(psutil.virtual_memory()))
f.write("{}\n\n".format(psutil.disk_partitions()))
f.flush()

In [None]:
# Record the two control parameters in the run log.
f.write('control parameter: size ' + str(size) + ' , tf transform ' + str(tfTransform) + '\n\n')

In [None]:
# Read the train and test CSVs from the data/ directory.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Log the (rows, columns) of both frames as loaded, before any subsampling.
f.write("train_data shape: {}\n".format(train_data.shape))
f.write("test_data shape: {}\n\n".format(test_data.shape))

In [None]:
# Keep only the first 1/size of the rows of each frame.
# .copy() detaches the subsample from the full frame so that adding columns to
# it later (e.g. 'num_words') does not raise SettingWithCopyWarning.
# NOTE: this cell overwrites train_data/test_data, so running it twice
# subsamples twice; re-run the loading cell first if you need to repeat it.
train_data = train_data.iloc[:len(train_data) // size].copy()
test_data = test_data.iloc[:len(test_data) // size].copy()

# Log the shapes that the rest of the notebook actually works with.
f.write("applied train_data shape: " + str(train_data.shape) + '\n')
f.write("applied test_data shape: " + str(test_data.shape) + '\n\n')
f.flush()
train_data.head()

In [None]:
# Number of training rows for each distinct value of the 'target' column.
train_data.loc[:, 'target'].value_counts()

In [None]:
# Whitespace-delimited token count of each question; values are passed through
# str() first, so non-string entries are counted on their string form.
train_data['num_words'] = train_data['question_text'].map(
    lambda question: len(str(question).split())
)

In [None]:
# Summary statistics of the per-question word counts.
train_data.loc[:, 'num_words'].describe()

In [None]:
start_time = time.time()

# Text and label columns used by every later cell.
train_text = train_data['question_text']
test_text = test_data['question_text']
train_target = train_data['target']

# Train and test questions stacked into one Series so the vocabulary covers both.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
all_text = pd.concat([train_text, test_text])

#count_vectorizer = CountVectorizer(stop_words=stop_words.ENGLISH_STOP_WORDS)
count_vectorizer = CountVectorizer()
count_vectorizer.fit(all_text)

elapsed_time = time.time() - start_time
# Format the duration as HH:MM:SS. The former trailing "%m" was the *month*
# directive, which always rendered as "01" for an elapsed duration.
elapsed_label = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print(elapsed_label)
f.write(elapsed_label + "   vectorization\n")
f.flush()

In [None]:
start_time = time.time()

# Raw term-count features from the already-fitted CountVectorizer.
train_text_features_cv = count_vectorizer.transform(train_text)
test_text_features_cv = count_vectorizer.transform(test_text)

print(train_text_features_cv.shape)
print(test_text_features_cv.shape)

# TF-IDF features; like the count vectorizer, fitted on train + test text combined.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(all_text)
train_text_features_tf = tfidf_vectorizer.transform(train_text)
test_text_features_tf = tfidf_vectorizer.transform(test_text)

print(train_text_features_tf.shape)
print(test_text_features_tf.shape)

# The tfTransform control parameter picks the representation the model cells use.
if tfTransform:
    train_text_features = train_text_features_tf
    test_text_features = test_text_features_tf
else:
    train_text_features = train_text_features_cv
    test_text_features = test_text_features_cv

elapsed_time = time.time() - start_time
# Format the duration as HH:MM:SS. The former trailing "%m" was the *month*
# directive, which always rendered as "01" for an elapsed duration.
elapsed_label = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

print(train_text_features.shape)
print(test_text_features.shape)
print(elapsed_label + "   feature transform")

# Log the shape of every feature matrix, then the one actually selected.
f.write("train_text_features_cv" + str(train_text_features_cv.shape) + "\n")
f.write("test_text_features_cv" + str(test_text_features_cv.shape) + "\n")
f.write("train_text_features_tf" + str(train_text_features_tf.shape) + "\n")
f.write("test_text_features_tf" + str(test_text_features_tf.shape) + "\n")
f.write("train_text_features" + str(train_text_features.shape) + "\n")
f.write("test_text_features" + str(test_text_features.shape) + "\n")
f.write(elapsed_label + "   feature transform\n")
f.flush()

In [None]:
# Sanity check: length of a zero vector with one slot per training row
# (the same allocation the cross-validation cells use for out-of-fold predictions).
np.zeros(train_data.shape[0]).shape[0]

In [None]:
#BernoulliNB
# 5-fold cross-validation: out-of-fold probabilities are scored with F1 on the
# training labels, and test-set probabilities are averaged over the folds.
start_time = time.time()

kfold = KFold(n_splits = 5, shuffle = True, random_state = 1)
# Each fold contributes equally to the averaged test prediction; deriving the
# weight from the splitter keeps it correct if n_splits is changed.
fold_weight = 1.0 / kfold.get_n_splits()

test_preds = np.zeros(test_text_features.shape[0])
oof_preds = np.zeros(train_data.shape[0])

for train_idx, valid_idx in kfold.split(train_data):
    x_train, x_valid = train_text_features[train_idx,:], train_text_features[valid_idx,:]
    # KFold yields row positions, so the labels are selected positionally (.iloc)
    # rather than by index label.
    y_train, y_valid = train_target.iloc[train_idx], train_target.iloc[valid_idx]
    classifier = BernoulliNB()
    classifier.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    oof_preds[valid_idx] = classifier.predict_proba(x_valid)[:,1]
    test_preds += fold_weight * classifier.predict_proba(test_text_features)[:,1]

# Probabilities above 0.25 are labelled positive.
# The builtin int replaces np.int, which was removed in NumPy 1.24.
pred_train = (oof_preds > 0.25).astype(int)

elapsed_time = time.time() - start_time
# Format the duration as HH:MM:SS. The former trailing "%m" was the *month*
# directive, which always rendered as "01" for an elapsed duration.
elapsed_label = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
# Compute the score once and reuse it for stdout and the log file.
score_line = "f1_score: " + str(f1_score(train_target, pred_train))

print(elapsed_label + "   BernoulliNB")
print(score_line)
f.write(elapsed_label + "   BernoulliNB\n")
f.write(score_line + "\n")
f.flush()

In [None]:
#LogisticRegression
# 5-fold cross-validation: out-of-fold probabilities are scored with F1 on the
# training labels, and test-set probabilities are averaged over the folds.
start_time = time.time()

kfold = KFold(n_splits = 5, shuffle = True, random_state = 1)
# Each fold contributes equally to the averaged test prediction; deriving the
# weight from the splitter keeps it correct if n_splits is changed.
fold_weight = 1.0 / kfold.get_n_splits()

test_preds = np.zeros(test_text_features.shape[0])
oof_preds = np.zeros(train_data.shape[0])

for train_idx, valid_idx in kfold.split(train_data):
    x_train, x_valid = train_text_features[train_idx,:], train_text_features[valid_idx,:]
    # KFold yields row positions, so the labels are selected positionally (.iloc)
    # rather than by index label.
    y_train, y_valid = train_target.iloc[train_idx], train_target.iloc[valid_idx]
    classifier = LogisticRegression(solver='liblinear')
    classifier.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    oof_preds[valid_idx] = classifier.predict_proba(x_valid)[:,1]
    test_preds += fold_weight * classifier.predict_proba(test_text_features)[:,1]

# Probabilities above 0.25 are labelled positive.
# The builtin int replaces np.int, which was removed in NumPy 1.24.
pred_train = (oof_preds > 0.25).astype(int)

elapsed_time = time.time() - start_time
# Format the duration as HH:MM:SS. The former trailing "%m" was the *month*
# directive, which always rendered as "01" for an elapsed duration.
elapsed_label = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
# Compute the score once and reuse it for stdout and the log file.
score_line = "f1_score: " + str(f1_score(train_target, pred_train))

print(elapsed_label + "   LogisticRegression")
print(score_line)
f.write(elapsed_label + "   LogisticRegression\n")
f.write(score_line + "\n")
f.flush()

In [None]:
#GradientBoostingClassifier
# 5-fold cross-validation: out-of-fold probabilities are scored with F1 on the
# training labels, and test-set probabilities are averaged over the folds.
start_time = time.time()

kfold = KFold(n_splits = 5, shuffle = True, random_state = 1)
# Each fold contributes equally to the averaged test prediction; deriving the
# weight from the splitter keeps it correct if n_splits is changed.
fold_weight = 1.0 / kfold.get_n_splits()

test_preds = np.zeros(test_text_features.shape[0])
oof_preds = np.zeros(train_data.shape[0])

for train_idx, valid_idx in kfold.split(train_data):
    x_train, x_valid = train_text_features[train_idx,:], train_text_features[valid_idx,:]
    # KFold yields row positions, so the labels are selected positionally (.iloc)
    # rather than by index label.
    y_train, y_valid = train_target.iloc[train_idx], train_target.iloc[valid_idx]
    classifier = GradientBoostingClassifier(random_state=1)
    classifier.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    oof_preds[valid_idx] = classifier.predict_proba(x_valid)[:,1]
    test_preds += fold_weight * classifier.predict_proba(test_text_features)[:,1]

# Probabilities above 0.25 are labelled positive.
# The builtin int replaces np.int, which was removed in NumPy 1.24.
pred_train = (oof_preds > 0.25).astype(int)

elapsed_time = time.time() - start_time
# Format the duration as HH:MM:SS. The former trailing "%m" was the *month*
# directive, which always rendered as "01" for an elapsed duration.
elapsed_label = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
# Compute the score once and reuse it for stdout and the log file.
score_line = "f1_score: " + str(f1_score(train_target, pred_train))

print(elapsed_label + "   GradientBoostingClassifier")
print(score_line)
f.write(elapsed_label + "   GradientBoostingClassifier\n")
f.write(score_line + "\n")
f.flush()

In [None]:
# Mark the end of the run on stdout and in the log file; each call takes its
# own timestamp.
print("{}   end of baseline model".format(datetime.datetime.now()))
f.write("{}   end of baseline model\n\n".format(datetime.datetime.now()))

In [None]:
# Close the run log opened in the setup cell; any later f.write would raise ValueError.
f.close()