## Text Mining Series 1: Predict Defect Name From Chinese Defect Description Text with Python

In [47]:
import pandas as pd
import numpy as np

import jieba
jieba.load_userdict('D:/application/R/library/jiebaRD/dict/user.dict.electricity.utf8')

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RidgeClassifierCV
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.svm import SVC  
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, 
                              GradientBoostingClassifier, AdaBoostClassifier)
from sklearn import clone
from sklearn.grid_search import GridSearchCV

# Python 2 only: make UTF-8 the interpreter-wide default string encoding so that
# implicit str <-> unicode conversions of the Chinese text do not fail.
import sys
# Keep a handle on the current stdout: it is put back right after reload(sys),
# presumably because the reload would otherwise detach the notebook's output stream.
stdout = sys.stdout
# reload(sys) is the usual Python 2 trick that makes sys.setdefaultencoding callable again.
reload(sys)
sys.stdout = stdout
sys.setdefaultencoding('utf8')

### 1. Get the data

In [2]:
# Load the labelled defect records from a CSV file in the working directory.
# NOTE(review): the first column header shown in the outputs below appears to start
# with an invisible BOM character — confirm, and if so consider encoding='utf-8-sig'.
data = pd.read_csv('defect_name_20161209.csv')  ### utf-8

In [3]:
data.head()

Unnamed: 0,﻿句子编码,原句子,特征词1,特征词2,特征词3,缺陷部位1,缺陷部位2,缺陷部位3
0,1,控制异常/巡视发现牛从甲直流极1换流变 Y/Y A相在线净油装置1电机存在异响，现场检查滤油...,在线净油装置,滤油机,其他,本体,滤油机,其他
1,2,油色谱异常/500kV 站用变油色谱5月10日起无数据更新。/,油色谱,其他,其他,本体,其他,其他
2,3,油色谱异常/500kV 站用变无法读取油色谱数据。/,油色谱,其他,其他,本体,其他,其他
3,4,其他/变电站后台油温二应改为绕组温度，OPEN3000无显示绕组温度（现场本体红外测温温度为...,其他,其他,其他,本体,其他,其他
4,5,其他/12台换流变爬梯“禁止攀爬”挡板均锁不上或未安装。/已将爬梯“禁止攀爬”挡板锁头调整好...,其他,其他,其他,本体,其他,其他


In [4]:
data.columns

Index([u'﻿句子编码', u'原句子', u'特征词1', u'特征词2', u'特征词3', u'缺陷部位1', u'缺陷部位2', u'缺陷部位3'], dtype='object')

In [5]:
data.columns[1]

'\xe5\x8e\x9f\xe5\x8f\xa5\xe5\xad\x90'

In [6]:
print(data.columns[1])

原句子


In [7]:
# The raw defect description lives in the second column; select it by position.
sentence_column = data.columns[1]
text = data[sentence_column]

### 2. Chinese text segmentation

In [8]:
print(text[0])

控制异常/巡视发现牛从甲直流极1换流变 Y/Y A相在线净油装置1电机存在异响，现场检查滤油机压力和红外测温均未发现异常。/


In [9]:
print(' '.join(jieba.cut(text[0])))

控制 异常 / 巡视 发现 牛 从 甲 直流 极 1 换 流变   Y / Y   A 相 在线净油装置 1 电机 存在 异响 ， 现场 检查 滤油机 压力 和 红外 测温 均 未 发现异常 。 /


In [10]:
def segment(sentence):
    """Cut one sentence with jieba and return its tokens joined by single spaces."""
    return ' '.join(jieba.cut(sentence))

# One space-separated token string per record, ready for a bag-of-words vectorizer.
text_seg = [segment(sentence) for sentence in text]

In [11]:
len(text_seg)

1661

In [12]:
print(text_seg[0])

控制 异常 / 巡视 发现 牛 从 甲 直流 极 1 换 流变   Y / Y   A 相 在线净油装置 1 电机 存在 异响 ， 现场 检查 滤油机 压力 和 红外 测温 均 未 发现异常 。 /


### 3. Make Document-Term Matrix (X)

In [13]:
# Bag-of-words counter applied to the space-separated jieba tokens.
# NOTE(review): stop_words='english' only filters English words, and the default
# token_pattern (see the repr below) keeps tokens of two or more word characters,
# so single-character segments are dropped — confirm this is intended for Chinese text.
vectorizer = CountVectorizer(stop_words = 'english')
vectorizer

CountVectorizer(analyzer=u'word', binary=False, charset=None,
        charset_error=None, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
# Learn the vocabulary and count terms, then densify the sparse result
# (one row per segmented sentence, one column per vocabulary term).
term_counts = vectorizer.fit_transform(text_seg)
x = term_counts.toarray()

In [15]:
x.shape

(1661L, 4514L)

### 4. Get response variable (Y)

In [16]:
y_1 = data[u'缺陷部位1']

In [17]:
y_1.head()

0    本体
1    本体
2    本体
3    本体
4    本体
Name: 缺陷部位1, dtype: object

In [18]:
# Build a lookup table that assigns a 1-based integer id to every distinct raw label.
y_1_levels = y_1.unique()
y_1_temp = pd.DataFrame({
    'y_1': y_1_levels,
    'y_1_id': np.arange(1, len(y_1_levels) + 1),
})

In [19]:
y_1_temp.shape

(6, 2)

In [20]:
# Two of the labels look identical: one is the correct label, the other carries
# a trailing space and therefore shows up as a separate class.
y_1_temp

Unnamed: 0,y_1,y_1_id
0,本体,1
1,套管,2
2,本体,3
3,非电量保护装置,4
4,冷却系统,5
5,调压开关,6


In [21]:
# Strip trailing whitespace from every label so that labels differing only by a
# trailing space collapse into a single class. The vectorised .str accessor keeps
# the original index (so the assignment always aligns row-by-row) and passes
# missing values through instead of raising on them.
data['y_1'] = y_1.str.rstrip()

In [22]:
# Rebuild the label -> integer id lookup table from the cleaned labels (ids start at 1).
y_1_clean_levels = data['y_1'].unique()
y_1_temp = pd.DataFrame({
    'y_1': y_1_clean_levels,
    'y_1_id': np.arange(1, len(y_1_clean_levels) + 1),
})
y_1_temp

Unnamed: 0,y_1,y_1_id
0,本体,1
1,套管,2
2,非电量保护装置,3
3,冷却系统,4
4,调压开关,5


In [23]:
# Attach the numeric id to every record through a left join on the cleaned label,
# then keep only the id column as the response.
data_with_id = data.merge(y_1_temp, how = 'left', on = 'y_1')
y_1_value = data_with_id['y_1_id']

In [24]:
y_1_value.head()

0    1
1    1
2    1
3    1
4    1
Name: y_1_id, dtype: int32

In [25]:
# Plain Python list of the label ids, in record order.
y_1_value_list = list(y_1_value.values)

In [26]:
len(y_1_value_list)

1661

### 5. Split data into train and test sets

In [27]:
x_train, x_test, y_train, y_test = train_test_split(x, y_1_value_list, test_size = 0.2, random_state = 1234L)

### 6. Model

#### 6.1 Fit Single model --- Logistic Regression

In [28]:
clf_logis = LogisticRegression(C = 1, penalty = 'l1')

In [29]:
# 3-fold cross-validated accuracy on the training split.
cv_scores = cross_val_score(clf_logis, x_train, y_train, cv = 3)
print(cv_scores)

# Fit on the full training split and report accuracy on the held-out test split.
clf_logis.fit(x_train, y_train)
y_pred = clf_logis.predict(x_test)
print(accuracy_score(y_test, y_pred))

[ 0.91910112  0.94117647  0.93877551]
0.948948948949


#### 6.2 Fit Single model --- Naive Bayes

In [30]:
clf_bayes = MultinomialNB()

In [31]:
cross_val_score(clf_bayes, x_train, y_train, cv = 3)

array([ 0.83820225,  0.83031674,  0.85034014])

In [32]:
y_pred = clf_bayes.fit(x_train,y_train).predict(x_test)
accuracy_score(y_test, y_pred)

0.85885885885885882

#### 6.3 Fit Single model --- Ridge Classifier

In [33]:
clf_ridge = RidgeClassifierCV()

In [34]:
cross_val_score(clf_ridge, x_train, y_train, cv = 3)

array([ 0.89213483,  0.90497738,  0.88435374])

In [35]:
clf_ridge.fit(x_train, y_train).score(x_test, y_test)

0.90390390390390385

In [36]:
y_pred = clf_ridge.fit(x_train,y_train).predict(x_test)
accuracy_score(y_test, y_pred)

0.90390390390390385

#### 6.4 Fit Single model --- K Nearest Neighbors (KNN)

In [40]:
clf_knn = KNeighborsClassifier()#default with k=5  

In [41]:
cross_val_score(clf_knn, x_train, y_train, cv = 3)

array([ 0.78651685,  0.75791855,  0.75736961])

In [42]:
clf_knn.fit(x_train, y_train).score(x_test, y_test)

0.80180180180180183

#### 6.5 Fit Single model --- Support Vector Machine (SVM) 

In [43]:
clf_svc = SVC(kernel = 'linear')#default with 'rbf'  

In [44]:
cross_val_score(clf_svc, x_train, y_train, cv = 3)

array([ 0.91685393,  0.91855204,  0.90249433])

In [45]:
clf_svc.fit(x_train, y_train).score(x_test, y_test)

0.93993993993993996

#### 6.6 Fit Ensemble model --- Bagging

In [50]:
clf_bag = BaggingClassifier(KNeighborsClassifier())

In [51]:
cross_val_score(clf_bag, x_train, y_train, cv = 3)

array([ 0.79325843,  0.75339367,  0.75283447])

In [52]:
clf_bag.fit(x_train, y_train).score(x_test, y_test)

0.79879879879879878

**Logistic Regression is best, because it has the highest accuracy score (0.949) on the test set.**

#### 6.7 Fit Other Ensemble models

In [37]:
models = [RandomForestClassifier(), ExtraTreesClassifier(), GradientBoostingClassifier(), AdaBoostClassifier()]

In [38]:
# Fit each ensemble prototype on the training split and report its test accuracy.
for model in models:
    # Work on a fresh, unfitted copy so the prototypes stored in `models` are never
    # mutated and re-running this cell always starts from the same state.
    clf = clone(model)
    print('Model', clf)
    print('Score', clf.fit(x_train, y_train).score(x_test, y_test))
    print

('Model', RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0))
('Score', 0.90090090090090091)

('Model', ExtraTreesClassifier(bootstrap=False, compute_importances=None,
           criterion='gini', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
           min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False,
           random_state=None, verbose=0))
('Score', 0.86786786786786785)

('Model', GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2, n_estimators=100,
              random_state=None, subsample=1.0, ver

**Among the ensemble models, the Gradient Boosting Classifier is best, because it has the highest accuracy score (0.822) on the test set.**

**Simpler is better, so we choose logistic regression as our best model.**

#### 6.8 Tune model

In [46]:
# Tune the regularisation strength C of a default LogisticRegression with 3-fold CV.
# NOTE(review): clf_logis above was built with penalty='l1', whereas this search keeps
# the estimator's default penalty — confirm which variant is meant to be tuned. The
# selected C is also the largest value in the grid, so a wider grid may be worth trying.
c_grid = {'C' : [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}
model_logis = GridSearchCV(LogisticRegression(), c_grid, cv = 3)
model_logis.fit(x_train, y_train)

print("Best parameters set found on development set:")
print(model_logis.best_params_)
print
print("Grid scores on development set:")
for params, mean_score, scores in model_logis.grid_scores_:
    # report the fold spread as two standard deviations
    spread = scores.std() * 2
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, spread, params))

# Accuracy of the tuned model on the held-out test split.
model_logis.score(x_test, y_test)

Best parameters set found on development set:
{'C': 10}

Grid scores on development set:
0.858 (+/-0.018) for {'C': 0.01}
0.890 (+/-0.016) for {'C': 0.05}
0.899 (+/-0.013) for {'C': 0.1}
0.911 (+/-0.011) for {'C': 0.5}
0.915 (+/-0.011) for {'C': 1}
0.919 (+/-0.007) for {'C': 5}
0.921 (+/-0.001) for {'C': 10}


0.9429429429429429