In [1]:
import logging
from collections import defaultdict
import operator
import random

import pandas as pd
import numpy as np
from gensim import models, corpora, similarities
from sklearn import svm
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer

import tensorflow as tf

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# data = pd.read_csv("tiantian_news_cutforsearch.csv")
data = pd.read_csv("tiantian_news_cut.csv")

In [3]:
data = data.drop(data.loc[data["cutwords"].isnull()].index)

In [4]:
def get_categories():
    input_file = "conf/categories.txt"
    with open(input_file, "rb") as f:
        result = f.readlines()
        result = [x.strip('\n').decode('utf8') for x in result]
    return result

In [5]:
categories = get_categories()

In [7]:
sample_num = 50
train = pd.DataFrame()
test = pd.DataFrame()

for cat in categories:
    data_onecat = data[data["category"] == cat.encode("utf8")]
    train_onecat = data_onecat.sample(n=sample_num)
    test_onecat = data_onecat.drop(train_onecat.index)
    train = pd.concat([train, train_onecat])
    test = pd.concat([test, test_onecat])
print len(train)
print len(test)

1000
521067


In [8]:
train_documents = train["cutwords"].tolist()
train_texts = [[word for word in document.split()] for document in train_documents]

In [21]:
test = test[:100]
test_documents = test["cutwords"].tolist()
test_texts = [[word for word in document.split()] for document in test_documents]

In [9]:
count_vec = CountVectorizer()
train_documents_counts = count_vec.fit_transform(train_documents)

In [10]:
train_documents_counts.shape

(1000, 55180)

In [11]:
tfidf_transformer = TfidfTransformer()
train_x_tfidf = tfidf_transformer.fit_transform(train_documents_counts)

In [12]:
train_x_tfidf.shape

(1000, 55180)

In [13]:
clf = svm.SVC()

In [14]:
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [15]:
clean_data = pd.concat([train, test])
clean_data["category"] = clean_data["category"].astype("category")
clean_data["category_encoded"] = clean_data["category"].cat.codes

In [16]:
train = clean_data[:1000]
test = clean_data[1000:]

In [17]:
train_y = train["category_encoded"].values.tolist()

In [18]:
clf.fit(train_x_tfidf, train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [22]:
test_documents_counts = count_vec.transform(test_documents)

In [23]:
test_documents_counts.shape

(100, 55180)

In [25]:
test_x_tfidf = tfidf_transformer.transform(test_documents_counts)

In [26]:
test_x_tfidf.shape

(100, 55180)

In [27]:
predict_res = clf.predict(test_x_tfidf)

In [28]:
predict_res

array([1, 1, 4, 6, 1, 1, 4, 1, 1, 6, 1, 1, 4, 1, 4, 1, 6, 1, 1, 1, 1, 4, 1,
       1, 1, 1, 1, 1, 1, 4, 4, 1, 4, 1, 1, 6, 1, 4, 1, 1, 4, 1, 1, 1, 4, 4,
       4, 1, 6, 1, 6, 1, 4, 1, 6, 1, 4, 1, 1, 1, 1, 6, 6, 6, 4, 4, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 6, 1, 4, 1, 7, 1, 4, 6,
       6, 1, 1, 6, 1, 1, 6, 1])

In [29]:
test_y = test["category_encoded"].values.tolist()

In [33]:
correct_num = 0
for i in range(len(predict_res)):
    if predict_res[i] == test_y[i]:
        correct_num += 1
print float(correct_num) * 100 / len(predict_res)

64.0
