In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score
from sklearn.model_selection import train_test_split,GridSearchCV
import jieba
import pandas as pd
import numpy as np

## 分词

In [60]:
stop_words = open('../data/stop_words.txt','r',encoding='utf-8').read().split('\n')
def word_seg(content):
    return [word for word in jieba.cut(content) if word not in stop_words and word!=' ']


In [67]:
# 内容分词
train_data=pd.read_csv('../data/train.csv')
# 去除重复的ID
train_data=train_data.drop_duplicates(subset=['content_id'],keep='first') # 去除重复id的数据
train_data['word_seg']=train_data['content'].apply(lambda x:" ".join(word_seg(x)))
print(train_data['sentiment_value'].value_counts())

# sent_copy=train_data[train_data['sentiment_value'].isin([1,-1])]
# train_data=pd.concat([train_data,sent_copy],axis=0)
# train_data=pd.concat([train_data,sent_copy],axis=0)
# train_data=pd.concat([train_data,sent_copy],axis=0)


 0    5836
-1    1275
 1    1179
Name: sentiment_value, dtype: int64


 我们可以发现类别不均衡 0：-1:1的比例将近为：5:1:1,但是如果像上面直接简单的copy是不可以的，会导致模型过拟合 

## 提取tfidf特征

In [62]:
vec = TfidfVectorizer(ngram_range=(1,3),min_df=1, max_df=0.9,use_idf=True,smooth_idf=True, sublinear_tf=True)
X_train_feature=vec.fit_transform(train_data['word_seg'])

## 模型训练

In [63]:
y_train_sent=train_data['sentiment_value'].astype(int)
X_train_sent,X_test_sent,y_train_sent,y_test_sent=\
    train_test_split(X_train_feature,y_train_sent,test_size=0.1,random_state=42)
# clf = LogisticRegression(C=4, dual=True)
# clf =svm.LinearSVC()
# clf =RandomForestClassifier()
clf =SGDClassifier(n_iter=80)
# tune_params(X_train_sent,y_train_sent)
clf.fit(X_train_sent, y_train_sent)




SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=80,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

##  过采样

In [111]:
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter
X_resampled_smote, y_resampled_smote = SMOTE().fit_sample(X_train_sent,y_train_sent) 
print(sorted(Counter(y_resampled_smote).items()))

[(-1, 5285), (0, 5285), (1, 5285)]


In [112]:
clf =SGDClassifier(n_iter=80)
clf.fit(X_resampled_smote, y_resampled_smote)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=80,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

## 在测试集评估模型

In [113]:
pred_test_sent=clf.predict(X_test_sent)
# 精确度=真阳性/（真阳性+假阳性）
precision=precision_score(y_test_sent,pred_test_sent,pos_label=None,average='weighted')
# 召回率=真阳性/（真阳性+假阴性）
recall=recall_score(y_test_sent,pred_test_sent,pos_label=None,average='weighted')
# F1
f1=f1_score(y_test_sent,pred_test_sent,pos_label=None,average='weighted')
# 精确率
accuracy=accuracy_score(y_test_sent,pred_test_sent)
print("precision:{:.4f}-recall:{:.4f}-f1:{:.4f}-accuracy:{:.4f}".format(precision,recall,f1,accuracy))

precision:0.6622-recall:0.6876-f1:0.6210-accuracy:0.6876


In [114]:
print(pred_test_sent.shape)
print(y_test_sent.value_counts())
print("真实情感为1或者-1的，预测正确的数量：")
_num=0
for pred_test,y_test in zip(list(pred_test_sent),list(y_test_sent)):
    if y_test==pred_test and y_test in [1,-1]:
        _num+=1
#         print(pred_test,y_test)
print(_num)


print("真实情感为1或者-1的，预测错误的数量：")
_num=0
for pred_test,y_test in zip(list(pred_test_sent),list(y_test_sent)):
    if y_test!=pred_test and y_test in [1,-1]:
        _num+=1
#         print(pred_test,y_test)
print(_num)

(829,)
 0    551
-1    153
 1    125
Name: sentiment_value, dtype: int64
真实情感为1或者-1的，预测正确的数量：
47
真实情感为1或者-1的，预测错误的数量：
231


In [115]:
# 预测结果
submit_data=pd.read_csv('../data/test_public.csv')
submit_data['word_seg']=submit_data['content'].apply(lambda x:" ".join(word_seg(x)))
X_test_feature=vec.transform(submit_data['word_seg'])
sent_preds=clf.predict(X_test_feature)
sent_preds

array([1, 0, 0, ..., 0, 0, 0])