In [3]:
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)  # fixed-point output instead of scientific notation

if __name__ == '__main__':
    # Percentile levels reported for every length distribution below.
    levels = [0, 50, 80, 90, 95, 98, 100]

    def token_counts(texts):
        """Return the number of whitespace-separated tokens in each text."""
        return [len(text.split()) for text in texts]

    def report(title, word_len, char_len):
        """Print ``title``, then the word-length and char-length percentiles."""
        print(title)
        for lengths in (word_len, char_len):
            print(np.percentile(lengths, levels))

    # Load both splits; the 'article' and 'word_seg' columns hold the texts,
    # 'class' holds the training labels.
    train_df = pd.read_csv('C:/dg/data/train_set.csv')
    test_df = pd.read_csv('C:/dg/data/test_set.csv')

    train_char = train_df['article'].tolist()
    train_word = train_df['word_seg'].tolist()
    train_label = train_df['class'].values
    test_char = test_df['article'].tolist()
    test_word = test_df['word_seg'].tolist()

    # Number of distinct label values present in the training set.
    num_label = len(set(train_label))
    print(f'# of labels: {num_label}')

    # Token counts per document, reported for the training split first.
    train_char_len = token_counts(train_char)
    train_word_len = token_counts(train_word)
    report('Training set', train_word_len, train_char_len)

    # Same statistics for the test split.
    test_char_len = token_counts(test_char)
    test_word_len = token_counts(test_word)
    report('Test set', test_word_len, test_char_len)


# of labels: 19
Training set
[    6.     514.     990.    1428.    1949.    2858.48 39759.  ]
[   50.     842.    1618.    2346.    3201.    4720.96 55804.  ]
Test set
[    6.   516.   992.  1429.  1949.  2826. 19755.]
[   50.   842.  1621.  2349.  3207.  4672. 31694.]


In [4]:
from collections import Counter

# from gensim.models import Word2Vec
import pandas as pd

def write_lines(path, texts):
    """Write every item of ``texts`` to ``path``, one item per line.

    The file is always written as UTF-8. Without an explicit encoding,
    ``open`` falls back to the locale's preferred encoding, which on Windows
    is usually not UTF-8 and can raise ``UnicodeEncodeError`` (or produce a
    file other tools cannot decode) as soon as a text contains non-ASCII
    characters.

    Args:
        path: Destination file path; an existing file is overwritten.
        texts: Iterable of items; each is formatted with ``str.format``
            semantics (``f'{item}'``) and followed by a newline.
    """
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f'{text}\n' for text in texts)


if __name__ == '__main__':
    train_df = pd.read_csv('C:/dg/data/train_set.csv')
    test_df = pd.read_csv('C:/dg/data/test_set.csv')

    # Concatenate train and test texts (train first) for each text column.
    train_words = train_df['word_seg'].values.tolist()
    test_words = test_df['word_seg'].values.tolist()
    all_words = train_words + test_words

    train_chars = train_df['article'].values.tolist()
    test_chars = test_df['article'].values.tolist()
    all_chars = train_chars + test_chars

    # One document per line in each output file.
    write_lines('C:/dg/data/all_words.txt', all_words)
    write_lines('C:/dg/data/all_chars.txt', all_chars)

In [None]:
import pickle as pkl

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

SEED = 2018
np.random.seed(SEED)  # seed NumPy's global RNG for this cell

if __name__ == '__main__':

    def save_pickle(obj, path):
        """Serialize ``obj`` to the file at ``path`` using pickle."""
        with open(path, 'wb') as handle:
            pkl.dump(obj, handle)

    train_df = pd.read_csv('C:/dg/data/train_set.csv')
    test_df = pd.read_csv('C:/dg/data/test_set.csv')

    # Text columns and labels. Labels are shifted down by one here; note that
    # this cell does not write ``train_label`` to disk.
    train_char = train_df['article'].tolist()
    train_word = train_df['word_seg'].tolist()
    train_label = train_df['class'].values - 1
    test_char = test_df['article'].tolist()
    test_word = test_df['word_seg'].tolist()

    # TF-IDF over uni- and bi-grams of the 'word_seg' texts; the vocabulary
    # is fitted on the training split only and reused for the test split.
    word_vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.9,
        sublinear_tf=True,
    )
    train_word_feat = word_vectorizer.fit_transform(train_word)
    test_word_feat = word_vectorizer.transform(test_word)

    # Reduce the TF-IDF matrix to 100 components; fitted on training data.
    svd = TruncatedSVD(n_components=100, n_iter=20, random_state=SEED)

    train_svd_feat = svd.fit_transform(train_word_feat)
    print('Training set transformed..')
    save_pickle(train_svd_feat, 'C:/dg/data/train_svd_feat.pkl')

    test_svd_feat = svd.transform(test_word_feat)
    print('Test set transformed..')
    save_pickle(test_svd_feat, 'C:/dg/data/test_svd_feat.pkl')

In [None]:
import os
import pickle as pkl

import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

def split_into_folds(ordered_indices, num_fold):
    """Yield ``(train_indices, valid_indices)`` for contiguous chunks of a list.

    ``ordered_indices`` is cut into ``num_fold`` consecutive chunks of
    ``len(ordered_indices) // num_fold`` items; the last chunk also takes the
    remainder. Each chunk is used once as the validation part, with all the
    other items (in their original order) forming the training part.

    Args:
        ordered_indices: Sequence of sample indices in the order to be cut.
        num_fold: Number of chunks to produce.

    Yields:
        Tuples of two lists: training indices and validation indices.
    """
    ordered_indices = list(ordered_indices)
    total = len(ordered_indices)
    fold_len = total // num_fold
    for fold in range(num_fold):
        start = fold * fold_len
        # The final fold absorbs whatever is left after integer division.
        end = total if fold == num_fold - 1 else start + fold_len
        yield (ordered_indices[:start] + ordered_indices[end:],
               ordered_indices[start:end])


def decode_labels(proba, classes):
    """Return the label value of the highest-scoring column for each row.

    Args:
        proba: 2-D array of per-class scores, one row per sample.
        classes: Label value for each column of ``proba``, in column order.

    Returns:
        1-D array of label values taken from ``classes``.
    """
    return np.asarray(classes)[np.argmax(proba, axis=1)]


if __name__ == '__main__':

    with open('C:/dg/data/train_svd_feat.pkl', 'rb') as f:
        train_data = pkl.load(f)
    with open('C:/dg/data/test_svd_feat.pkl', 'rb') as f:
        test_data = pkl.load(f)
    # NOTE(review): the labels are read from a relative path while the
    # features come from C:/dg/data, and no visible cell writes label.npy —
    # confirm the file exists and is row-aligned with train_svd_feat.pkl.
    train_label = np.load('../../data/label.npy')

    # Sorted distinct label values. The classifier is expected to order its
    # predict_proba columns the same way (sorted classes), so column k
    # corresponds to classes[k].
    classes = np.unique(train_label)
    num_classes = len(classes)

    num_fold = 10

    # Create the output directory up front so a missing folder is reported
    # before the folds are trained rather than at the final np.save.
    os.makedirs('../../oof_pred', exist_ok=True)

    # Concatenate the stratified validation folds into a single shuffled
    # ordering of all sample indices; the folds actually used below are
    # equal-length slices of this ordering.
    skf_indices = []
    skf = StratifiedKFold(n_splits=num_fold, shuffle=True, random_state=2018)
    for _, valid_idx in skf.split(np.ones(train_data.shape[0]), train_label):
        skf_indices.extend(valid_idx.tolist())

    # Out-of-fold predictions for the training set and fold-averaged
    # predictions for the test set.
    train_pred = np.zeros((train_data.shape[0], num_classes))
    test_pred = np.zeros((test_data.shape[0], num_classes))

    folds = split_into_folds(skf_indices, num_fold)
    for fold, (train_indices, test_indices) in enumerate(folds):

        print(f'Processing fold {fold}...')

        train_x, test_x = train_data[train_indices], train_data[test_indices]
        train_y = train_label[train_indices]

        clf = LGBMClassifier(n_estimators=1000)
        clf.fit(train_x, train_y)
        # Held-out rows receive predictions from a model that never saw them.
        pred = clf.predict_proba(test_x)
        train_pred[test_indices] = pred
        # Each fold contributes an equal share to the test-set average.
        pred = clf.predict_proba(test_data)
        test_pred += pred / num_fold

    # Map column indices back to label values so the score is also correct
    # when the stored labels are not exactly 0..num_classes-1.
    y_pred = decode_labels(train_pred, classes)
    score = f1_score(train_label, y_pred, average='macro')
    print(score)

    np.save(f'../../oof_pred/_lgbm_svd_train_{score:.4f}', train_pred)
    np.save(f'../../oof_pred/_lgbm_svd_test_{score:.4f}', test_pred)
