In [50]:
import logging
from collections import defaultdict
import operator
import random

import pandas as pd
import numpy as np
from gensim import models, corpora, similarities
from sklearn import svm
from sklearn import linear_model
from sklearn.neural_network import MLPClassifier

from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import joblib

import tensorflow as tf

# Configure the root logger: show records at INFO level and above, each line
# formatted as "<timestamp> : <level name> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
# Load the news corpus into `data`. Later cells read its "cutwords" column
# (whitespace-separated tokens) and its "category" column.
# Alternative input file, kept for reference:
# data = pd.read_csv("tiantian_news_cutforsearch.csv")
data = pd.read_csv("raw_data/tiantian_news_cut.csv")

In [5]:
# Keep only the rows that actually have segmented text; rows whose "cutwords"
# value is missing cannot be vectorised later on.
has_cutwords = data["cutwords"].notnull()
data = data[has_cutwords]

In [6]:
def get_categories(input_file="conf/categories.txt"):
    """Read the list of category names from a UTF-8 text file.

    The file is expected to hold one category name per line.

    Args:
        input_file: Path of the category file. Defaults to
            "conf/categories.txt", the location the notebook has always used.

    Returns:
        A list of unicode strings, in file order. Surrounding whitespace
        (including a stray "\\r" from Windows line endings) is removed and
        blank lines are skipped, so every returned name is non-empty.

    Raises:
        IOError/OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(input_file, "rb") as f:
        # Split on the raw bytes so "\n", "\r\n" and "\r" endings are all handled.
        raw_lines = f.read().splitlines()
    names = [line.decode("utf8").strip() for line in raw_lines]
    # An empty name would later select zero rows and make sampling fail.
    return [name for name in names if name]

In [7]:
# Category names (unicode strings) read from conf/categories.txt; the sampling
# cell below iterates over them to build the train/test split.
categories = get_categories()

In [8]:
# Build a class-balanced split: for every category draw `sample_num` rows for
# training and send all remaining rows of that category to the test set.
sample_num = 5000
# Fixed seed so the same rows are drawn on every run (the original sampling
# was unseeded, making results impossible to reproduce).
random_seed = 42

train_parts = []
test_parts = []
for cat in categories:
    # Category names are unicode while the comparison is made against their
    # UTF-8 encoding -- presumably the CSV column holds byte strings (Python 2).
    data_onecat = data[data["category"] == cat.encode("utf8")]
    # NOTE: sample() raises ValueError when a category has fewer than
    # `sample_num` rows, so every category contributes exactly `sample_num`.
    train_onecat = data_onecat.sample(n=sample_num, random_state=random_seed)
    test_onecat = data_onecat.drop(train_onecat.index)
    train_parts.append(train_onecat)
    test_parts.append(test_onecat)

# Concatenate once at the end: growing a DataFrame with pd.concat inside the
# loop re-copies all previously collected rows on every iteration.
train = pd.concat(train_parts) if train_parts else pd.DataFrame()
test = pd.concat(test_parts) if test_parts else pd.DataFrame()
# test = test.sample(n=10000)
print(len(train))
print(len(test))

100000
422067


In [9]:
# Raw training documents (one whitespace-separated token string per row) and
# the same documents as lists of tokens.
train_documents = train["cutwords"].tolist()
train_texts = [document.split() for document in train_documents]

In [10]:
# Raw test documents (one whitespace-separated token string per row) and the
# same documents as lists of tokens.
test_documents = test["cutwords"].tolist()
test_texts = [document.split() for document in test_documents]

In [11]:
# Learn the vocabulary from the training documents only and turn them into a
# document-term count matrix; the fitted `count_vec` is reused for the test set.
# NOTE(review): with default settings CountVectorizer's token pattern only
# keeps tokens of two or more word characters, so single-character words in
# the pre-segmented text are presumably dropped -- confirm this is intended.
count_vec = CountVectorizer()
train_documents_counts = count_vec.fit_transform(train_documents)

In [12]:
# (number of training documents, vocabulary size)
train_documents_counts.shape

(100000, 722913)

In [13]:
# Fit the tf-idf weighting on the training counts only, then apply it to them;
# the fitted transformer is reused unchanged for the test counts.
tfidf_transformer = TfidfTransformer()
train_x_tfidf = tfidf_transformer.fit_transform(train_documents_counts)

In [14]:
# Same shape as the count matrix: tf-idf re-weights entries, it adds no columns.
train_x_tfidf.shape

(100000, 722913)

In [15]:
# Count test-set terms using the vocabulary fitted on the training documents
# (transform only, no re-fitting).
test_documents_counts = count_vec.transform(test_documents)

In [16]:
# Apply the tf-idf weighting fitted on the training counts to the test counts.
test_x_tfidf = tfidf_transformer.transform(test_documents_counts)

In [17]:
# Discard the row labels inherited from `data` and renumber each frame from 0.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [17]:
# Stack train on top of test so both share one category -> integer code
# mapping, then store the categorical column and its integer codes.
clean_data = pd.concat([train, test])
category_column = clean_data["category"].astype("category")
clean_data["category"] = category_column
clean_data["category_encoded"] = category_column.cat.codes

In [19]:
# Cut `clean_data` back into its two halves by position: the first
# sample_num * len(categories) rows are the training rows stacked earlier.
n_train_rows = sample_num * len(categories)
train = clean_data.iloc[:n_train_rows]
test = clean_data.iloc[n_train_rows:]

In [20]:
# Sanity check: row counts of both splits and the shape of the training
# feature matrix, one value per line.
for size in (len(train), len(test), train_x_tfidf.shape):
    print(size)

100000
422067
(100000, 722913)


In [21]:
# Integer class labels for the training rows, as a plain Python list.
train_y = train["category_encoded"].values.tolist()

In [22]:
# Integer class labels for the test rows, as a plain Python list.
test_y = test["category_encoded"].values.tolist()

In [23]:
# clf_svm_1 = svm.SVC(C=100)
# clf_svm_1.fit(train_x_tfidf, train_y)

In [40]:
# clf_svm_2 = svm.SVC(C=10)
# clf_svm_2.fit(train_x_tfidf, train_y)

In [46]:
# clf_lr_1 = linear_model.LogisticRegression(C=1, solver="sag", multi_class="multinomial", verbose=100, n_jobs=-1)
# clf_lr_1.fit(train_x_tfidf, train_y)

In [58]:
# Train a multi-layer perceptron on the tf-idf features. The variable keeps the
# name `clf_lr_1` from the commented-out logistic-regression experiment because
# the evaluation cell refers to it by that name -- it is NOT a linear model.
# early_stopping=True holds out part of the training data for validation and
# stops when the validation score stops improving (see the training log below).
# NOTE(review): hidden_layer_sizes=(100, 2) requests two hidden layers of 100
# and 2 units; a 2-unit layer is a very tight bottleneck for a multi-class
# problem -- confirm this is intended rather than e.g. (100,).
# NOTE(review): random_state is not set, so weights and the validation split
# differ between runs.
clf_lr_1 = MLPClassifier(hidden_layer_sizes=(100, 2), learning_rate_init=0.01, verbose=True, early_stopping=True)
clf_lr_1.fit(train_x_tfidf, train_y)

Iteration 1, loss = 1.85381794


Validation score: 0.615400


Iteration 2, loss = 1.05445769


Validation score: 0.745600


Iteration 3, loss = 0.71809940


Validation score: 0.742900


Iteration 4, loss = 0.60121207


Validation score: 0.744500


Iteration 5, loss = 0.55214013


Validation score: 0.753000


Iteration 6, loss = 0.51895668


Validation score: 0.753300


Iteration 7, loss = 0.48503793


Validation score: 0.746000


Iteration 8, loss = 0.46536983


Validation score: 0.751700


Iteration 9, loss = 0.45474121


Validation score: 0.742400
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100, 2), learning_rate='constant',
       learning_rate_init=0.01, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=True, warm_start=False)

In [26]:
# joblib.dump(clf_svm_1, "svm_1_allfeas.pkl")
# joblib.dump(clf_svm_2, "svm_2_allfeas.pkl")
# joblib.dump(clf_lr_1, "lr_1_allfeas.pkl")
# joblib.dump(clf_lr_2, "lr_2_allfeas.pkl")

In [59]:
# Evaluate `clf_lr_1` on the held-out test set and append progress lines to
# logs/run_2.log.
#
# The original loop called predict() once per test row and reopened the log
# file every 100 rows. Predictions are now made in batches and the log is
# written in one go; the file contents are the same: a "lr" header followed by
# "<row index>, <correct so far incl. this row>, <correct_num_2>" for every
# 100th row.
batch_size = 10000  # rows per predict() call; bounds peak memory use
n_test = test_x_tfidf.shape[0]
expected = np.asarray(test_y)

predicted_parts = [
    clf_lr_1.predict(test_x_tfidf[start:start + batch_size])
    for start in range(0, n_test, batch_size)
]
if predicted_parts:
    is_correct = np.concatenate(predicted_parts) == expected
else:
    # No test rows at all: nothing was predicted, nothing is correct.
    is_correct = np.zeros(0, dtype=bool)

# running_correct[i] == number of correct predictions among rows 0..i.
running_correct = np.cumsum(is_correct)
correct_num_1 = int(is_correct.sum())
# Slot for a second classifier (clf_lr_2), which is currently disabled; it
# stays 0 so the log format and the later summary cell keep working.
correct_num_2 = 0

with open("logs/run_2.log", "ab") as f:
    f.write(b"lr\n")
    for i in range(0, n_test, 100):
        line = "{}, {}, {}\n".format(i, int(running_correct[i]), correct_num_2)
        f.write(line.encode("ascii"))

# Accuracy in percent. The second figure is always 0.0 while clf_lr_2 is off.
print(float(correct_num_1) * 100 / n_test)
print(float(correct_num_2) * 100 / n_test)

In [61]:
# Re-print the accuracy (in percent) of both counters over the whole test set.
n_test_rows = test_x_tfidf.shape[0]
for correct_count in (correct_num_1, correct_num_2):
    print(float(correct_count) * 100 / n_test_rows)

76.5868926024
0.0


In [None]:
# correct_num_1 = 0
# correct_num_2 = 0
# with open("logs/run_2.log", "ab") as f:
#     f.write("svm\n")
# for i in range(test_x_tfidf.shape[0]):
#     predict_1 = clf_svm_1.predict(test_x_tfidf[i])
#     predict_2 = clf_svm_2.predict(test_x_tfidf[i])
#     if predict_1[0] == test_y[i]:
#         correct_num_1 += 1
#     if predict_2[0] == test_y[i]:
#         correct_num_2 += 1
#     if i % 100 == 0:
#         with open("logs/run_2.log", "ab") as f:
#             f.write("{}, {}, {}\n".format(i, correct_num_1, correct_num_2))
# print float(correct_num_1) * 100 / test_x_tfidf.shape[0]
# print float(correct_num_2) * 100 / test_x_tfidf.shape[0]