In [3]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

import sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

In [4]:
## 设置字符集，防止中文乱码
mpl.rcParams['font.sans-serif']=[u'simHei']
mpl.rcParams['axes.unicode_minus']=False

In [5]:
# jupyter展示图片，非内嵌显示
# tk: 显示出来，inline：内嵌显示，默认为inline
%matplotlib tk

In [6]:
# 1. 文件数据读取
df = pd.read_csv('../data/result_process02', sep=',')
df.head(10)

Unnamed: 0,label,has_date,jieba_cut_content,content_sema
0,1.0,1,非 财务 纠淼 牟 莆 窆 芾 - （ 沙盘 模拟 ） - - ...,8.456151
1,0.0,1,讲 的 是 孔子 后人 的 故事 。 一个 老 领导 回到 家乡 ...,7.486084
2,1.0,1,尊敬 的 贵 公司 ( 财务 / 经理 ) 负责人 您好 ！ 我 ...,7.175171
3,1.0,1,贵 公司 负责人 ( 经理 / 财务 ） 您好 ： 深圳市 华龙 公...,7.565682
4,1.0,1,这是 一封 HTML 格式 信件 ！ - - - - - - - ...,2.063409
5,1.0,1,TO ： 贵 公司 经理 、 财务 您好 ！ 深圳市 春洋 贸易 有...,7.143747
6,0.0,1,那 他 为什么 不 愿意 起诉 ， 既然 这样 了 ！ 起诉 后 ...,4.807568
7,1.0,1,尊敬 的 负责人 （ 经理 ／ 财务 ） ： 您好 ！ 我 是 深...,6.593684
8,1.0,1,您好 以下 是 特别 为 阁下 发 的 香港 信息 ( 图片 ...,7.611074
9,0.0,1,我 觉得 ， 负债 不要紧 ， 最 重要 的 是 能 负得起 这个 ...,7.04134


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64620 entries, 0 to 64619
Data columns (total 4 columns):
label                64619 non-null float64
has_date             64620 non-null int64
jieba_cut_content    64284 non-null object
content_sema         64620 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 2.0+ MB


In [20]:
# 2. 数据分割
x_train, x_test, y_train, y_test = train_test_split(df[['has_date','jieba_cut_content','content_sema']], df['label'], test_size=0.2, random_state=0)
print("训练数据集大小:%d" % x_train.shape[0])
print("测试数据集大小:%d" % x_test.shape[0])

训练数据集大小:51696
测试数据集大小:12924


In [21]:
x_train.head(5)

Unnamed: 0,has_date,jieba_cut_content,content_sema
42325,1,感谢您 打开 本 公司 邮件 ， 祝您 工作 顺利 ， 生意兴隆 ！ ...,3.798726
14673,1,他 结婚 前 那种 口气 和 呢 爸爸 说话 的 时候 ， 呢 酒...,7.580747
49644,1,我 和 gg 想 十一 去 泰山 ， 曲阜 等 地 ， 一共 三天...,5.220846
62007,1,cissp ： 您好 ！ 从 网上 看到 您 的 邮址 ， 冒昧 给...,7.231017
27725,1,KR - PC 和 G 型 温湿度 遥控 系统 是 广州 庆瑞 电子...,2.597746


In [23]:
# 3. 开始模型训练
# 3.1 特征工程，将文本数据转换为数值型数据
vectorizer = CountVectorizer()
transfromer = TfidfTransformer(norm='l2', use_idf=True)
svd = TruncatedSVD(n_components=20)
jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
vectorizer_model = vectorizer.fit(jieba_cut_content)
df0 = vectorizer_model.transform(jieba_cut_content)
transfromer_model = transfromer.fit(df0)
df1 = transfromer_model.transform(df0)
svd_model = svd.fit(df1)
df2 = svd_model.transform(df1)
data = pd.DataFrame(df2)
print(data.head(5))
print(data.info())

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [25]:
# 3.2 数据合并
data['has_date'] = x_train['has_date'].astype("float64")
data['content_sema'] = x_train['content_sema']
print(data.head(5))
print(data.info())

          0         1         2         3         4         5         6  \
0  0.580972  0.023836  0.004217 -0.172864 -0.050395 -0.031510  0.265910   
1  0.009848  0.104202  0.119389  0.079951 -0.097514 -0.089965  0.001998   
2  0.011713  0.030442  0.019109  0.006414 -0.004962  0.002871  0.006360   
3  0.053513  0.120704  0.069297 -0.030864  0.037340  0.013360  0.026196   
4  0.015687  0.060261  0.017480 -0.021015  0.014019  0.046781  0.003174   

          7         8         9      ...             12        13        14  \
0  0.001207  0.000653  0.009725      ...      -0.032728 -0.000214 -0.063970   
1  0.001838 -0.002683 -0.053779      ...       0.004626  0.000593 -0.006881   
2  0.006704  0.000361  0.001722      ...       0.002537  0.000121  0.000670   
3  0.027286  0.005593  0.085179      ...      -0.016109  0.002305  0.048134   
4  0.030464  0.001531  0.024273      ...       0.049679  0.000728  0.020931   

         15        16        17        18        19  has_date  content_sem

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,has_date,content_sema
0,0.580972,0.023836,0.004217,-0.172864,-0.050395,-0.03151,0.26591,0.001207,0.000653,0.009725,...,-0.032728,-0.000214,-0.06397,-0.001014,-0.017425,-0.057294,-0.000741,-0.007027,1.0,8.456151
1,0.009848,0.104202,0.119389,0.079951,-0.097514,-0.089965,0.001998,0.001838,-0.002683,-0.053779,...,0.004626,0.000593,-0.006881,0.000569,-0.002701,-0.021346,-0.004816,-0.005994,1.0,7.486084
2,0.011713,0.030442,0.019109,0.006414,-0.004962,0.002871,0.00636,0.006704,0.000361,0.001722,...,0.002537,0.000121,0.00067,0.002688,-0.002001,-0.008683,-0.004944,0.013858,1.0,7.175171
3,0.053513,0.120704,0.069297,-0.030864,0.03734,0.01336,0.026196,0.027286,0.005593,0.085179,...,-0.016109,0.002305,0.048134,-0.026402,0.009411,-0.084547,-0.099804,-0.081314,,
4,0.015687,0.060261,0.01748,-0.021015,0.014019,0.046781,0.003174,0.030464,0.001531,0.024273,...,0.049679,0.000728,0.020931,-0.014137,-0.016058,-0.010756,-0.026628,-0.039425,1.0,2.063409
