### Accusation model using TF-IDF features and logistic regression

In [1]:
import jieba
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression



### Data Loading 

In [2]:
# Read the train and test CSVs, in that order, into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/sample/data_train_tokenized.csv",
        "../Data/sample/data_test_tokenized.csv",
    )
)

# Cell output: (rows, columns) of each frame.
(df_train.shape, df_test.shape)

((154592, 9), (32508, 9))

In [3]:
# Keep only the rows whose `accusation` is one of the three labels below.
accusations = ['盗窃', '走私、贩卖、运输、制造毒品', '故意伤害']
df_train = df_train.loc[lambda frame: frame['accusation'].isin(accusations)]
df_test = df_test.loc[lambda frame: frame['accusation'].isin(accusations)]

# Cell output: shapes after filtering.
(df_train.shape, df_test.shape)

((8339, 9), (4540, 9))

In [4]:
# Label distribution of the filtered training split.
df_train['accusation'].value_counts()

走私、贩卖、运输、制造毒品    4274
盗窃               2278
故意伤害             1787
Name: accusation, dtype: int64

In [5]:
# Label distribution of the filtered test split.
df_test['accusation'].value_counts()

盗窃               1800
走私、贩卖、运输、制造毒品    1597
故意伤害             1143
Name: accusation, dtype: int64

In [7]:
tfidf_vectorizer = TfidfVectorizer(min_df=2, max_df=0.8)
%time train_X = tfidf_vectorizer.fit_transform(df_train.fact.values)

CPU times: user 905 ms, sys: 32.4 ms, total: 937 ms
Wall time: 990 ms


In [8]:
classifier = LogisticRegression()
%time classifier.fit(train_X, df_train.accusation.values)

CPU times: user 240 ms, sys: 15.6 ms, total: 256 ms
Wall time: 270 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Evaluate on the test split. The vectorizer is only transform()-ed here, so
# the test facts are encoded with the vocabulary fitted on the training facts.
test_X = tfidf_vectorizer.transform(df_test['fact'].values)
test_y = df_test['accusation'].values
pred = classifier.predict(test_X)

print ("Accuracy: ", accuracy_score(test_y, pred))
# Cell output: confusion matrix of true labels vs. predictions.
confusion_matrix(test_y, pred)

Accuracy:  0.9969162995594714


array([[1143,    0,    0],
       [   4, 1788,    8],
       [   0,    2, 1595]])

### Important words

In [None]:
# Vocabulary terms as a NumPy array, so they can be indexed with an array of
# column indices when ranking coefficients.
feature_names = np.asarray(tfidf_vectorizer.get_feature_names())

def find_important_words(coef, top_n=50, names=None):
    """Print the tokens whose coefficients have the largest absolute value.

    Parameters
    ----------
    coef : 1-D array-like of float
        One coefficient per vocabulary term (for example one row of
        ``classifier.coef_``).
    top_n : int, default 50
        Maximum number of tokens to print.
    names : array-like of str, optional
        Token for each position in ``coef``. When omitted, the notebook-level
        ``feature_names`` array is used, which matches the previous behaviour.

    Returns
    -------
    None
        Tokens are printed one per line, largest ``|coefficient|`` first.
    """
    if names is None:
        # Fall back to the vocabulary array defined at notebook level.
        names = feature_names
    names = np.asarray(names)
    # argsort is ascending, so reverse it to rank by descending |coefficient|.
    ranked = np.argsort(np.abs(coef))[::-1]
    for token in names[ranked[:top_n]]:
        print(token)

In [None]:
# Print the most influential tokens for each class.
# With the three accusation labels used in this notebook, coef_ has one row per
# class, in the same order as classifier.classes_. A two-class model would have
# a single coef_ row, so this pairing must not be reused there.
for c, (label, coef) in enumerate(zip(classifier.classes_, classifier.coef_)):
    print ("------------------------")
    print ("       Class {} ({})     ".format(c, label))
    # find_important_words prints the tokens itself and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" per class.
    find_important_words(coef)