# 任务一：基于机器学习的文本分类
## 实现基于logistic/softmax regression的文本分类

## 1.参考

    文本分类
    《神经网络与深度学习》 第2/3章
## 2.数据集：Classify the sentiment of sentences from the Rotten Tomatoes dataset-best acc score: 0.76526

## 3.实现要求：NumPy

## 4.需要了解的知识点：

    ### 文本特征表示：Bag-of-Word，N-gram
    ### 分类器：logistic/softmax regression，损失函数、（随机）梯度下降、特征选择
    ### 数据集：训练集/验证集/测试集的划分
## 5.实验：

    分析不同的特征、损失函数、学习率对最终分类性能的影响
    shuffle 、batch、mini-batch
## 6.时间：两周
-------------------
使用numpy导入文件参考
https://numpy.org/devdocs/reference/generated/numpy.loadtxt.html?highlight=loadtxt#numpy.loadtxt

https://www.runoob.com/numpy/numpy-dtype.html

https://blog.csdn.net/qq_38634140/article/details/88650519

https://blog.csdn.net/messi_james/article/details/80487389
np.set_printoptions(suppress=True) #取消默认的科学计数法
#这里的skiprows是指跳过前1行, 如果设置skiprows=2, 就会跳过前两行
#Python默认读取的数字的数据类型为双精度浮点数
#comment的是指, 如果行的开头为‘#’就会跳过该行
#usecols是指只使用0,2两列。usecols=(0,1,2,3,4)
x = np.loadtxt("./sentiment-analysis-on-movie-reviews/train.tsv", dtype=np.dtype([('id','S20'), ('txt','S1000'), ('label','S20')]), delimiter='\t', skiprows=0, usecols=(1, 2, 3), unpack=False)

--------------------
使用numpy导入文件太费劲了，使用pandas代替。

* 词袋模型 https://blog.csdn.net/hao5335156/article/details/80615057
* Jack Cui 机器学习 https://cuijiahua.com/blog/2017/11/ml_6_logistic_1.html
* https://gitbook.cn/gitchat/column/5cd016a4e30c87051ad2be27/topic/5cd0eb00e30c87051ad2d2b7

In [1]:
import numpy as np, pandas as pd

In [2]:
train = pd.read_csv('./sentiment-analysis-on-movie-reviews/train.tsv', sep='\t')
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
test = pd.read_csv('./sentiment-analysis-on-movie-reviews/test.tsv', sep='\t')
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [4]:
print('train shape: {} & test shape: {}'.format(train.shape, test.shape))

train shape: (156060, 4) & test shape: (66292, 3)


## 借助sklearn划分数据集

In [5]:
from sklearn.model_selection import train_test_split
train_texts = list(train['Phrase'].values)
train_labels = train['Sentiment'].values
test_texts = list(test['Phrase'].values)
x_train, x_valid, y_train, y_valid = train_test_split(train_texts, train_labels, test_size=0.2)

In [6]:
def sentences_to_bag_of_words(all_sentences):
    vocabSet = []
    for sentence in all_sentences:
        for word in sentence.split():
            if word not in vocabSet:
                vocabSet.append(word)
    return vocabSet

In [7]:
def text_to_vector(bow, sentences):
    res = []
    for sentence in sentences:
        feature = [0] * len(bow)
        for word in sentence.split():
            if word in bow:
                feature[bow.index(word)] = 1
        res.append(feature)
    return res

In [8]:
all_text = list(train['Phrase'].values) + list(test['Phrase'].values)
print('all_text shape: {} & all_text0: {}'.format(len(all_text), all_text[0]))

all_text shape: 222352 & all_text0: A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .


In [9]:
%%time
bow = sentences_to_bag_of_words(all_text)
print('bow list longth', len(bow)) # 21637

bow list longth 21637
Wall time: 38.9 s


## 词表的长度是21637，若使用全部单词特征，稀疏矩阵为156060 * 21637维，使用卡方特征选择1000个最好特征单词。

In [10]:
class ChiSquare:
    def __init__(self, doc_list, doc_labels):
        self.total_data, self.total_pos_data, self.total_neg_data = {}, {}, {}
        for i, doc in enumerate(doc_list):
            if doc_labels[i] == 1:
                for word in doc.split():
                    self.total_pos_data[word] = self.total_pos_data.get(word, 0) + 1
                    self.total_data[word] = self.total_data.get(word, 0) + 1
            else:
                for word in doc.split():
                    self.total_neg_data[word] = self.total_neg_data.get(word, 0) + 1
                    self.total_data[word] = self.total_data.get(word, 0) + 1

        total_freq = sum(self.total_data.values())
        total_pos_freq = sum(self.total_pos_data.values())
        # total_neg_freq = sum(self.total_neg_data.values())

        self.words = {}
        for word, freq in self.total_data.items():
            pos_score = self.__calculate(self.total_pos_data.get(word, 0), freq, total_pos_freq, total_freq)
            # neg_score = self.__calculate(self.total_neg_data.get(word, 0), freq, total_neg_freq, total_freq)
            self.words[word] = pos_score * 2

    @staticmethod
    def __calculate(n_ii, n_ix, n_xi, n_xx):
        n_ii = n_ii
        n_io = n_xi - n_ii
        n_oi = n_ix - n_ii
        n_oo = n_xx - n_ii - n_oi - n_io
        return n_xx * (float((n_ii*n_oo - n_io*n_oi)**2) /
                       ((n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)))

    def best_words(self, num, need_score=False):
        words = sorted(self.words.items(), key=lambda word_pair: word_pair[1], reverse=True)
        if need_score:
            return [word for word in words[:num]]
        else:
            return [word[0] for word in words[:num]]

In [11]:
k = 1000
fe = ChiSquare(train_texts, train_labels)
best_words = fe.best_words(k)

In [12]:
%%time
train_bow_feature = text_to_vector(best_words, x_train) # 将best_words换成bog会爆内存
print(train_bow_feature[0])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [13]:
valid_bow_feature = text_to_vector(best_words, x_valid)
test_bow_feature = text_to_vector(best_words, test_texts)

In [15]:
from sklearn.utils import shuffle
train_bow_feature, y_train=shuffle(train_bow_feature, y_train)

## 使用逻辑回归算法

In [16]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=2019, 
                         solver='sag', #优化算法：liblinear、lbfgs、newton-cg、sag
                         multi_class='multinomial' #分类方式：multinomial、ovr
)

In [17]:
%%time
clf.fit(train_bow_feature, y_train)

Wall time: 1min 22s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=2019, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
predict = clf.predict(valid_bow_feature)
print(np.mean(predict == y_valid))

0.5963091118800461


In [21]:
res = clf.predict(test_bow_feature)
submission = pd.read_csv('./sentiment-analysis-on-movie-reviews/sampleSubmission.csv')
submission['Sentiment'] = res
submission.to_csv('./submission.csv', index=None)

## 使用词频TF特征

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
# 提取文本计数特征 -- 每个单词的数量
# 对文本的单词进行计数，包括文本的预处理, 分词以及过滤停用词
train_texts = list(train['Phrase'].values)
train_labels = train['Sentiment'].values
test_texts = list(test['Phrase'].values)
x_train, x_valid, y_train, y_valid = train_test_split(train_texts, train_labels, test_size=0.2)

count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)
x_valid_counts = count_vect.transform(x_valid)
print(x_train_counts.shape, x_valid_counts.shape)  # (93636, 15188) (31212, 15188)  矩阵(句子-词汇）的维度，词表大小15188
# 在词汇表中一个单词的索引值对应的是该单词在整个训练的文集中出现的频率。
# print(count_vect.vocabulary_.get(u'good'))    #5812     count_vect.vocabulary_是一个词典：word-id
x_train_counts, y_train=shuffle(x_train_counts, y_train)
clf = LogisticRegression(random_state=2019, solver='saga',  # 优化算法：liblinear、lbfgs、newton-cg、sag
                             multi_class='multinomial',  # 分类方式：multinomial、ovr
                             max_iter=1000).fit(x_train_counts, y_train)
predict = clf.predict(x_valid_counts)
print(np.mean(predict == y_valid))

(124848, 15230) (31212, 15230)
0.6563821607074202


## 使用词频TF-IDF特征

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
# 提取TF-IDF特征 -- word级别的TF-IDF
# 将各文档中每个单词的出现次数除以该文档中所有单词的总数：这些新的特征称之为词频tf。
train_texts = list(train['Phrase'].values)
train_labels = train['Sentiment'].values
test_texts = list(test['Phrase'].values)
x_train, x_valid, y_train, y_valid = train_test_split(train_texts, train_labels, test_size=0.2)

tfidf_transformer = TfidfVectorizer(analyzer='word', max_features=50000)
tfidf_transformer.fit(x_train)
x_train_tfidf_word = tfidf_transformer.transform(x_train)
x_valid_tfidf_word = tfidf_transformer.transform(x_valid)
print(x_train_tfidf_word.shape, x_valid_tfidf_word.shape)
x_train_tfidf_word, y_train=shuffle(x_train_tfidf_word, y_train)
clf = LogisticRegression(random_state=2019, solver='saga',  # 优化算法：liblinear、lbfgs、newton-cg、sag
                             multi_class='multinomial',  # 分类方式：multinomial、ovr
                             max_iter=1000).fit(x_train_tfidf_word, y_train)
predict = clf.predict(x_valid_tfidf_word)
print(np.mean(predict == y_valid))

(124848, 15232) (31212, 15232)
0.6346917852108164


## 使用词频3-gram TF-IDF特征

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# 提取TF-IDF特征 - ngram级别的TF-IDF
# 将各文档中每个单词的出现次数除以该文档中所有单词的总数：这些新的特征称之为词频tf。
train_texts = list(train['Phrase'].values)
train_labels = train['Sentiment'].values
test_texts = list(test['Phrase'].values)
x_train, x_valid, y_train, y_valid = train_test_split(train_texts, train_labels, test_size=0.2)

tfidf_transformer = TfidfVectorizer(analyzer='word', ngram_range=(2, 3), max_features=50000)
tfidf_transformer.fit(x_train)
x_train_tfidf_ngram = tfidf_transformer.transform(x_train)
x_valid_tfidf_ngram = tfidf_transformer.transform(x_valid)
print(x_train_tfidf_ngram.shape, x_valid_tfidf_ngram.shape)
x_train_tfidf_ngram, y_train=shuffle(x_train_tfidf_ngram, y_train)
clf = LogisticRegression(random_state=2019, solver='saga',  # 优化算法：liblinear、lbfgs、newton-cg、sag
                             multi_class='multinomial',  # 分类方式：multinomial、ovr
                             max_iter=1000).fit(x_train_tfidf_ngram, y_train)
predict = clf.predict(x_valid_tfidf_ngram)
print(np.mean(predict == y_valid))

(124848, 50000) (31212, 50000)
0.5996091246956299


## 使用组合特征

In [29]:
%% time
from scipy.sparse import hstack

train_texts = list(train['Phrase'].values)
train_labels = train['Sentiment'].values
test_texts = list(test['Phrase'].values)
x_train, x_valid, y_train, y_valid = train_test_split(train_texts, train_labels, test_size=0.2)

train_bow_feature = text_to_vector(best_words, x_train)
valid_bow_feature = text_to_vector(best_words, x_valid)
test_bow_feature = text_to_vector(best_words, test_texts)

count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)
x_valid_counts = count_vect.transform(x_valid)
x_test_counts = count_vect.transform(test_texts)

tfidf_transformer = TfidfVectorizer(analyzer='word', max_features=50000)
tfidf_transformer.fit(x_train)
x_train_tfidf_word = tfidf_transformer.transform(x_train)
x_valid_tfidf_word = tfidf_transformer.transform(x_valid)
x_test_tfidf_word = tfidf_transformer.transform(test_texts)

tfidf_transformer = TfidfVectorizer(analyzer='word', ngram_range=(2, 3), max_features=50000)
tfidf_transformer.fit(x_train)
x_train_tfidf_ngram = tfidf_transformer.transform(x_train)
x_valid_tfidf_ngram = tfidf_transformer.transform(x_valid)
x_test_tfidf_ngram = tfidf_transformer.transform(test_texts)

train_features = hstack([np.array(train_bow_feature), x_train_counts, x_train_tfidf_word, x_train_tfidf_ngram])
valid_features = hstack([np.array(valid_bow_feature), x_valid_counts, x_valid_tfidf_word, x_valid_tfidf_ngram]) 
test_features = hstack([np.array(test_bow_feature), x_test_counts, x_test_tfidf_word, x_test_tfidf_ngram])


x_train_tfidf_ngram, y_train=shuffle(train_features, y_train)
clf = LogisticRegression(random_state=2019, solver='saga',  # 优化算法：liblinear、lbfgs、newton-cg、sag
                             multi_class='multinomial',  # 分类方式：multinomial、ovr
                             max_iter=1000).fit(x_train_tfidf_ngram, y_train)
predict = clf.predict(valid_features)
print(np.mean(predict == y_valid))

reslts = clf.predict(test_features)
submission = pd.read_csv('./sentiment-analysis-on-movie-reviews/sampleSubmission.csv')
submission['Sentiment'] = reslts
submission.to_csv('./all_features_submission.csv', index=None)

0.6640715109573241


## TODO:
### 1、数据清洗
### 2、尝试其他分类器
### 3、利用matplotlib进行数据探索性分析

In [1]:
# def get_word_frequncy(all_sentences):
#     bag_of_words = {}
#     for sentence in all_sentences:
#         for word in sentence.split():
#             if word in bag_of_words:
#                 bag_of_words[word] += 1
#             else:
#                 bag_of_words[word] = 1
#     return bag_of_words