In [103]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spooky-author-identification/train.zip
/kaggle/input/spooky-author-identification/test.zip
/kaggle/input/spooky-author-identification/sample_submission.zip


In [104]:
from sklearn.model_selection import KFold

In [105]:
train_df = pd.read_csv('/kaggle/input/spooky-author-identification/train.zip') 
test_df = pd.read_csv('/kaggle/input/spooky-author-identification/test.zip')

In [106]:
feature_drop = ['id', 'text']
train_X = train_df.drop(feature_drop + ['author'], axis = 1)
test_X = test_df.drop(feature_drop, axis = 1)
train_y = train_df['author']

In [107]:
mapping_y = {'EAP': 0, 'HPL': 1, 'MWS': 2}
train_y = train_y.map(mapping_y)

In [108]:
#feature 1
train_X['num_char'] = train_df['text'].apply(lambda x: len(str(x)))
test_X['num_char'] = test_df['text'].apply(lambda x: len(str(x)))

In [109]:
#feature 2
train_X['num_word'] = train_df['text'].apply(lambda x: len(str(x).split()))
test_X['num_word'] = test_df['text'].apply(lambda x: len(str(x).split()))

In [110]:
#feature 3
train_X['num_unique_word'] = train_df['text'].apply(lambda x: len(set(str(x).split())))
test_X['num_unique_word'] = test_df['text'].apply(lambda x: len(set(str(x).split())))

In [111]:
#feature 4
train_X['num_word_upper'] = train_df['text'].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
test_X['num_word_upper'] = test_df['text'].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

In [112]:
#feature_5
train_X['num_word_title'] = train_df['text'].apply(lambda x : len([w for w in str(x).split() if w.istitle()]))
test_X['num_word_title'] = test_df['text'].apply(lambda x : len([w for w in str(x).split() if w.istitle()]))

In [113]:
import string

In [114]:
#feature 6
train_X['num_word_punctuation'] = train_df['text'].apply(lambda x: len([w for w in str(x).split() if w in string.punctuation]))
test_X['num_word_punctuation'] = test_df['text'].apply(lambda x: len([w for w in str(x).split() if w in string.punctuation]))

In [115]:
from nltk.corpus import stopwords
stop_word = set(stopwords.words('english'))

In [116]:
#feature 7
train_X['num_stopword'] = train_df['text'].apply(lambda x: len([w for w in str(x).split() if w in stop_word]))
test_X['num_stopword'] = test_df['text'].apply(lambda x: len([w for w in str(x).split() if w in stop_word]))

In [117]:
import numpy as np

In [118]:
#feature 8
train_X['mean_word'] = train_df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_X['mean_word'] = test_df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [119]:
from nltk import sent_tokenize, word_tokenize

In [120]:
train_X['num_sen'] = train_df['text'].apply(lambda x: len(sent_tokenize(x)))
test_X['num_sen'] = test_df['text'].apply(lambda x: len(sent_tokenize(x)))

In [121]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn import naive_bayes

In [122]:
def run_MNB(train_X, train_y, test_X, test_y, test_X2):
    model = OneVsRestClassifier(naive_bayes.MultinomialNB(),  n_jobs = 1)
    model.fit(train_X, train_y)
    pred_test_y = model.predict_proba(test_X)
    pred_test_y2 = model.predict_proba(test_X2)
    return pred_test_y, pred_test_y2, model

In [123]:
import xgboost as xgb

In [124]:
def run_XGB(train_X, train_y, test_X, test_y = None, test_X2 = None, seed_val = 0, child = 1, colsample = 0.3):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1 
    param['max_depth'] = 3
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = 'mlogloss'
    param['min_child_weight'] = child
    param['subsample'] = 0.8
    param['colsample_bytree'] = colsample
    param['seed'] = seed_val
    num_rounds = 2000
    
    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label = train_y)
    
    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label = test_y)
        watch_list = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watch_list, early_stopping_rounds = 50, verbose_eval = 20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(xgtrain, xgtest, num_rounds)
    
    pred_test_y = model.predict(xgtest)
    
    if test_X2 is not None:
        xgtest2 = xgb.DMatrix(test_X2)
        pred_test_y2 = model.predict(xgtest2)
    
    return pred_test_y, pred_test_y2, model

In [125]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [126]:
n_feature = 20

In [127]:
train_tfidfVectorizer = TfidfVectorizer(stop_words = 'english', ngram_range= (1,3))
tfidf_full = train_tfidfVectorizer.fit_transform(train_df['text'].values.tolist() + test_df['text'].values.tolist())
tfidf_train = train_tfidfVectorizer.transform(train_df['text'].values.tolist())
tfidf_test = train_tfidfVectorizer.transform(test_df['text'].values.tolist())

In [128]:
kf_model = KFold(n_splits = 5, shuffle = True, random_state = 1)
pred_train = np.zeros([train_df.shape[0], 3])
pred_full_test = 0
for dev_index, val_index in kf_model.split(train_X):
    dev_X, val_X = tfidf_train[dev_index], tfidf_train[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = run_MNB(dev_X, dev_y, val_X, val_y, tfidf_test)
    pred_train[val_index,:] = pred_val_y
    pred_full_test += pred_test_y
pred_full_test /= 5

In [129]:
train_X["nb_cvec_eap0"] = pred_train[:,0]
train_X["nb_cvec_hpl0"] = pred_train[:,1]
train_X["nb_cvec_mws0"] = pred_train[:,2]
test_X["nb_cvec_eap0"] = pred_full_test[:,0]
test_X["nb_cvec_hpl0"] = pred_full_test[:,1]
test_X["nb_cvec_mws0"] = pred_full_test[:,2]

In [130]:
svd_model = TruncatedSVD(n_components=n_feature, algorithm='arpack')
svd_model.fit(tfidf_full)
train_svd = pd.DataFrame(svd_model.transform(tfidf_train))
test_svd = pd.DataFrame(svd_model.transform(tfidf_test))

In [131]:
train_svd.columns = ['svd_char_'+str(i) for i in range(n_feature)]
test_svd.columns = ['svd_char_'+str(i) for i in range(n_feature)]

In [132]:
train_X = pd.concat([train_X, train_svd], axis = 1)
test_X = pd.concat([test_X, test_svd], axis = 1)

In [133]:
del tfidf_full, tfidf_train, tfidf_test, train_svd, test_svd

In [134]:
train_countVectorizer = CountVectorizer(stop_words = 'english', ngram_range = (1, 3))
train_countVectorizer.fit_transform(train_df['text'].values.tolist() + test_df['text'].values.tolist())
count_train = train_countVectorizer.transform(train_df['text'].values.tolist())
count_test = train_countVectorizer.transform(test_df['text'].values.tolist())

In [135]:
from sklearn.model_selection import KFold
from sklearn import metrics

In [136]:
kf_model = KFold(n_splits = 5, shuffle = True, random_state = 1)
pred_train = np.zeros([train_df.shape[0], 3])
pred_full_test = 0
for dev_index, val_index in kf_model.split(train_X):
    dev_X, val_X = count_train[dev_index], count_train[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = run_MNB(dev_X, dev_y, val_X, val_y, count_test)
    pred_train[val_index,:] = pred_val_y
    pred_full_test += pred_test_y

pred_full_test /= 5

In [137]:
train_X["nb_cvec_eap"] = pred_train[:,0]
train_X["nb_cvec_hpl"] = pred_train[:,1]
train_X["nb_cvec_mws"] = pred_train[:,2]
test_X["nb_cvec_eap"] = pred_full_test[:,0]
test_X["nb_cvec_hpl"] = pred_full_test[:,1]
test_X["nb_cvec_mws"] = pred_full_test[:,2]

In [138]:
del count_train, count_test

In [139]:
train_countVectorizer = CountVectorizer(ngram_range = (1,7), analyzer = 'char')
train_countVectorizer.fit_transform(train_df['text'].values.tolist() + test_df['text'].values.tolist())
count_train = train_countVectorizer.transform(train_df['text'].values.tolist())
count_test = train_countVectorizer.transform(test_df['text'].values.tolist())

In [140]:
kf_model = KFold(n_splits = 5, shuffle = True, random_state = 1)
pred_train = np.zeros([train_df.shape[0], 3])
pred_full_test = 0
for dev_index, val_index in kf_model.split(train_X):
    dev_X, val_X = count_train[dev_index], count_train[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = run_MNB(dev_X, dev_y, val_X, val_y, count_test)
    pred_train[val_index,:] = pred_val_y
    pred_full_test += pred_test_y

pred_full_test /= 5

In [141]:
train_X["nb_cvec_eap1"] = pred_train[:,0]
train_X["nb_cvec_hpl1"] = pred_train[:,1]
train_X["nb_cvec_mws1"] = pred_train[:,2]
test_X["nb_cvec_eap1"] = pred_full_test[:,0]
test_X["nb_cvec_hpl1"] = pred_full_test[:,1]
test_X["nb_cvec_mws1"] = pred_full_test[:,2]

In [142]:
del count_train, count_test

In [143]:
train_tfidfVectorizer = TfidfVectorizer(ngram_range= (1,5), analyzer = 'char')
tfidf_full = train_tfidfVectorizer.fit_transform(train_df['text'].values.tolist() + test_df['text'].values.tolist())
tfidf_train = train_tfidfVectorizer.transform(train_df['text'].values.tolist())
tfidf_test = train_tfidfVectorizer.transform(test_df['text'].values.tolist())

In [144]:
kf_model = KFold(n_splits = 5, shuffle = True, random_state = 1)
pred_train = np.zeros([train_df.shape[0], 3])
pred_full_test = 0
for dev_index, val_index in kf_model.split(train_X):
    dev_X, val_X = tfidf_train[dev_index], tfidf_train[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = run_MNB(dev_X, dev_y, val_X, val_y, tfidf_test)
    pred_train[val_index,:] = pred_val_y
    pred_full_test += pred_test_y
pred_full_test /= 5

In [145]:
train_X["nb_cvec_eap2"] = pred_train[:,0]
train_X["nb_cvec_hpl2"] = pred_train[:,1]
train_X["nb_cvec_mws2"] = pred_train[:,2]
test_X["nb_cvec_eap2"] = pred_full_test[:,0]
test_X["nb_cvec_hpl2"] = pred_full_test[:,1]
test_X["nb_cvec_mws2"] = pred_full_test[:,2]

In [146]:
svd_model = TruncatedSVD(n_components=n_feature, algorithm='arpack')
svd_model.fit(tfidf_full)
train_svd1 = pd.DataFrame(svd_model.transform(tfidf_train))
test_svd1 = pd.DataFrame(svd_model.transform(tfidf_test))

In [147]:
train_svd1.columns = ['svd_char1_'+str(i) for i in range(n_feature)]
test_svd1.columns = ['svd_char1_'+str(i) for i in range(n_feature)]

In [148]:
train_X = pd.concat([train_X, train_svd1], axis = 1)
test_X = pd.concat([test_X, test_svd1], axis = 1)

In [149]:
del tfidf_full, tfidf_train, tfidf_test, train_svd1, test_svd1

In [150]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Tokenize sentences
tokenized_sentences_train = [word_tokenize(sentence.lower()) for sentence in train_df['text']]
tokenized_sentences_test = [word_tokenize(sentence.lower()) for sentence in test_df['text']]
# Train Word2Vec model
model_train = Word2Vec(sentences=tokenized_sentences_train, vector_size=100, window=5, min_count=1, workers=4)
model_test = Word2Vec(sentences=tokenized_sentences_test, vector_size=100, window=5, min_count=1, workers=4)

def get_average_vector(tokens, model1):
    vectors = [model1.wv[word] for word in tokens if word in model1.wv]
    if not vectors:
        return np.zeros(model1.vector_size)  # Trả về vector zero nếu không có từ nào
    return np.mean(vectors, axis=0)

# Tạo vector trung bình cho từng câu trong DataFrame
X_train = np.array([get_average_vector(tokens, model_train) for tokens in tokenized_sentences_train])
X_test = np.array([get_average_vector(tokens, model_test) for tokens in tokenized_sentences_test])

# Chuyển đổi sang DataFrame để dễ dàng xử lý
X_df_train = pd.DataFrame(X_train)
X_df_test = pd.DataFrame(X_test)


num_features = X_df_train.shape[1]  # Số lượng đặc trưng (kích thước vector)
column_names = [f'feature_{i}' for i in range(num_features)] 

X_df_train.columns = column_names
X_df_test.columns = column_names

In [151]:
train_X = pd.concat([train_X, X_df_train], axis = 1)
test_X = pd.concat([test_X, X_df_test], axis = 1)

In [153]:
kf = KFold(n_splits=5, shuffle=True, random_state=2017)
pred_train = np.zeros([train_df.shape[0], 3])
pred_y = list()
for dev_index, val_index in kf.split(train_X):
    dev_X, val_X = train_X.loc[dev_index], train_X.loc[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]
    pred_val_y, pred_test_y, model = run_XGB(dev_X, dev_y, val_X, val_y, test_X, seed_val=0, colsample=0.5)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index,:] = pred_val_y
    pred_y.append(pred_test_y)
    break

[0]	train-mlogloss:1.00968	test-mlogloss:1.00968


Parameters: { "silent" } are not used.



[20]	train-mlogloss:0.41436	test-mlogloss:0.41858
[40]	train-mlogloss:0.34193	test-mlogloss:0.35563
[60]	train-mlogloss:0.31592	test-mlogloss:0.33985
[80]	train-mlogloss:0.29857	test-mlogloss:0.33216
[100]	train-mlogloss:0.28434	test-mlogloss:0.32685
[120]	train-mlogloss:0.27249	test-mlogloss:0.32310
[140]	train-mlogloss:0.26211	test-mlogloss:0.32104
[160]	train-mlogloss:0.25237	test-mlogloss:0.32004
[180]	train-mlogloss:0.24356	test-mlogloss:0.31887
[200]	train-mlogloss:0.23559	test-mlogloss:0.31796
[220]	train-mlogloss:0.22850	test-mlogloss:0.31740
[240]	train-mlogloss:0.22070	test-mlogloss:0.31681
[260]	train-mlogloss:0.21389	test-mlogloss:0.31624
[280]	train-mlogloss:0.20745	test-mlogloss:0.31635
[300]	train-mlogloss:0.20130	test-mlogloss:0.31614
[320]	train-mlogloss:0.19526	test-mlogloss:0.31613
[324]	train-mlogloss:0.19400	test-mlogloss:0.31648


In [155]:
pred_y = np.array(pred_y)

In [157]:
pred_y

array([[[0.03625342, 0.01526284, 0.9484837 ],
        [0.98862773, 0.00933318, 0.00203913],
        [0.05990933, 0.93487716, 0.00521353],
        ...,
        [0.97766715, 0.01807267, 0.00426024],
        [0.02161431, 0.00960149, 0.96878415],
        [0.03226269, 0.96593297, 0.00180441]]], dtype=float32)

In [159]:
print(pred_y.shape)


(1, 8392, 3)


In [164]:
output = output = pd.DataFrame({
    'id': test_df['id'], 
    'EAP': pred_y[0, :, 0], 
    'HPL': pred_y[0, :, 1], 
    'MWS': pred_y[0, :, 2]
})


In [166]:
output

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.036253,0.015263,0.948484
1,id24541,0.988628,0.009333,0.002039
2,id00134,0.059909,0.934877,0.005214
3,id27757,0.790470,0.202069,0.007461
4,id04081,0.934357,0.036097,0.029545
...,...,...,...,...
8387,id11749,0.620640,0.022671,0.356689
8388,id10526,0.070961,0.016027,0.913012
8389,id13477,0.977667,0.018073,0.004260
8390,id13761,0.021614,0.009601,0.968784


In [167]:
r = output.to_csv('submission.csv', index = False)