In [None]:
from sklearn import linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import ensemble
from sklearn.model_selection import KFold
import pandas as pd
import pandas, xgboost
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# English stop-word list (NLTK) and the WordNet lemmatizer used by the
# cleaning steps below.
stopWord = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# Set form of the stop-word list so each membership test is O(1).
_stop_word_set = frozenset(stopWord)


def _clean_comments(comments):
    """Return a cleaned copy of a Series of raw comment strings.

    Steps, in order: strip HTML tags, drop stop words, strip hyperlinks,
    replace every run of non-letters with a single space, drop
    one-character tokens, and lemmatize the remaining tokens.

    Args:
        comments: pandas Series of comment text. Missing values are treated
            as empty strings so the per-row string operations do not fail.

    Returns:
        A new pandas Series holding the cleaned, space-joined tokens.
    """
    cleaned = comments.fillna('')

    # Remove HTML tags. regex=True is spelled out because pandas 2.0 changed
    # the default of Series.str.replace to literal (non-regex) matching.
    cleaned = cleaned.str.replace(r'<.*?>', '', regex=True)

    # Remove stop words. The comparison is done on the lower-cased token so
    # capitalised stop words ("The", "And") are dropped as well.
    # NOTE(review): punctuation is still attached at this point, so a token
    # such as "the," is not filtered; the step order of the original
    # pipeline is kept.
    cleaned = cleaned.apply(
        lambda text: ' '.join(
            word for word in text.split() if word.lower() not in _stop_word_set
        )
    )

    # Remove hyperlinks (raw string; the dot after "www" is matched literally).
    cleaned = cleaned.str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)

    # Replace every run of non-letter characters with one space.
    cleaned = cleaned.str.replace(r'[^A-Za-z]+', ' ', regex=True)

    # Drop single-character tokens, then lemmatize what is left.
    return cleaned.apply(
        lambda text: ' '.join(
            lemmatizer.lemmatize(word) for word in text.split() if len(word) > 1
        )
    )


######################################################## 1. Dataset preparation
# load the dataset
# train Data
df_trainData = pd.read_csv("D:\\NTU MSIS\\H6751-TEXT & WEB MINING\\Kaggle Competition\\Data\\train.csv")
# test Data
df_testData = pd.read_csv("D:\\NTU MSIS\\H6751-TEXT & WEB MINING\\Kaggle Competition\\Data\\test.csv")

# Apply exactly the same cleaning to the training and the test comments.
df_trainData['Comment'] = _clean_comments(df_trainData['Comment'])
df_testData['Comment'] = _clean_comments(df_testData['Comment'])

# prepare cross validation
X = df_trainData['Comment']
y = df_trainData['Outcome']

# Two-fold splitter; every other KFold option is left at its default.
kf = KFold(n_splits=2)

def train_model(classifier, feature_vector_train, label, feature_vector_valid, valid_label=None):
    """Fit a classifier and return its accuracy on a validation set.

    Args:
        classifier: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        feature_vector_train: feature matrix used for fitting.
        label: training labels aligned with ``feature_vector_train``.
        feature_vector_valid: feature matrix to predict on.
        valid_label: true labels for ``feature_vector_valid``. When omitted,
            the module-level ``valid_y`` is used, which keeps existing
            four-argument callers working.

    Returns:
        The value of ``metrics.accuracy_score`` for the validation
        predictions.

    Raises:
        NameError: if ``valid_label`` is omitted and no module-level
            ``valid_y`` has been defined yet.
    """
    if valid_label is None:
        # Backward-compatible fallback to the global set by the calling loop.
        valid_label = valid_y

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    # accuracy_score is documented as (y_true, y_pred).
    return metrics.accuracy_score(valid_label, predictions)

# create a dataframe using texts and labels
# NOTE(review): no longer used for fitting the vectorizers below; retained
# in case code outside this section still reads `trainDF` - TODO confirm.
trainDF = pandas.DataFrame()
trainDF['text'] = df_trainData['Comment']

######################################################## 3. Model Building
# (display name, estimator factory, convert matrices to CSC before use).
# A fresh estimator is built from the factory for every evaluation.
model_specs = [
    ("Naive Bayes", naive_bayes.MultinomialNB, False),        # 3.1 Naive Bayes
    ("Logistic Regression", linear_model.LogisticRegression, False),  # 3.2 Linear Classifier
    ("SVM", svm.SVC, False),                                  # 3.3 SVM Model
    ("RandomForest", ensemble.RandomForestClassifier, False),  # 3.4 Bagging Model
    ("Xgb", xgboost.XGBClassifier, True),                     # 3.5 Boosting Model
]

for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print("************************************Kfold: " + str(fold))
    # KFold yields positional indices, so select rows by position.
    train_x, valid_x = X.iloc[train_index], X.iloc[test_index]
    # train_model scores against the module-level `valid_y` when it is not
    # given validation labels explicitly, so it must be bound here before
    # any model is evaluated.
    train_y, valid_y = y.iloc[train_index], y.iloc[test_index]

    ######################################################## 2. Feature Engineering
    # Every vectorizer is fitted on the training fold only, so the vocabulary
    # and IDF weights never see the validation comments.

    ############ 2.1 Count Vectors as features
    count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
    xtrain_count = count_vect.fit_transform(train_x)
    xvalid_count = count_vect.transform(valid_x)

    ############ 2.2 TF-IDF Vectors as features
    # word level tf-idf
    tfidf_vect = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True,
                                 analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
    xtrain_tfidf = tfidf_vect.fit_transform(train_x)
    xvalid_tfidf = tfidf_vect.transform(valid_x)
    # ngram level tf-idf (bigrams and trigrams)
    tfidf_vect_ngram = TfidfVectorizer(min_df=5, max_df=0.8, sublinear_tf=True, use_idf=True,
                                       analyzer='word', token_pattern=r'\w{1,}',
                                       ngram_range=(2, 3), max_features=5000)
    xtrain_tfidf_ngram = tfidf_vect_ngram.fit_transform(train_x)
    xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)

    # (display name, training matrix, validation matrix)
    feature_sets = [
        ("Count Vectors", xtrain_count, xvalid_count),
        ("WordLevel TF-IDF", xtrain_tfidf, xvalid_tfidf),
        ("N-Gram Vectors", xtrain_tfidf_ngram, xvalid_tfidf_ngram),
    ]

    # Evaluate every model on every feature representation.
    for model_name, make_classifier, use_csc in model_specs:
        for feature_name, fold_train, fold_valid in feature_sets:
            if use_csc:
                # Train and validation matrices are converted the same way.
                fold_train, fold_valid = fold_train.tocsc(), fold_valid.tocsc()
            accuracy = train_model(make_classifier(), fold_train, train_y, fold_valid)
            print("........................................" + model_name + ", " + feature_name + ": ", accuracy)
        print("")

print('------ DONE ------')