In [1]:
import torchtext 
from torchtext import data
import torchtext.vocab as vocab

import torch
import torch.nn.functional as F
import torch.nn as nn
from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd
import jieba
import jieba.analyse
import jieba.posseg
import random
import re
import os
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score


import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei']

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Text_Classfier_Data():
    """Prepare the ChnSentiCorp hotel-review corpus for sentiment classification.

    The working table lives in ``self.df`` (columns ``label`` and ``review``) and
    is mutated in place by the cleaning steps, which are meant to be called in
    this order: ``clear_content`` -> ``cut_sentence`` -> ``remove_stoplist``.
    The remaining methods cover TF-IDF / k-means exploration, augmentation of
    the label-0 (negative) class, and writing train/valid/test CSVs to ``data/``.
    """

    def __init__(self):
        """Read ``data/ChnSentiCorp_htl_all.csv`` (relative to the CWD) into ``self.df``."""
        self.path = os.path.abspath('.')
        # Path components are joined separately so the file is found on every OS
        # (a hard-coded backslash separator only resolves on Windows).
        self.df = pd.read_csv(os.path.join(self.path, 'data', 'ChnSentiCorp_htl_all.csv'))
        # Force every review to ``str`` so the string operations below never
        # receive a float NaN.
        self.df['review'] = self.df['review'].astype(str)

    def data_info(self):
        """Print the DataFrame summary (dtypes, non-null counts, memory usage)."""
        # ``DataFrame.info`` prints by itself and returns None, so it is not
        # wrapped in ``print``.
        self.df.info()

    def distri_label(self):
        """Print the number of rows per label."""
        print(self.df['label'].value_counts())

    def clear_content(self):
        """Strip punctuation, whitespace and digits from every review, in place."""
        # Single raw string; the character class covers ASCII punctuation, the
        # listed CJK punctuation marks, whitespace and the digits 0-9.
        pattern = r"""[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~—！，。？·￥、《》···【】：\s0-9]+"""
        re_obj = re.compile(pattern)
        self.df['review'] = self.df['review'].apply(lambda text: re_obj.sub('', text))

    def cut_sentence(self):
        """Replace every review string by its list of jieba tokens, in place."""
        self.df['review'] = self.df['review'].apply(jieba.lcut)

    def remove_stoplist(self):
        """Drop stop words and re-join the remaining tokens with single spaces.

        Expects ``cut_sentence`` to have run first, i.e. each review is a list of
        tokens.  Stop words are read from ``data/stoplist.txt`` (UTF-8, one per line).
        """
        stop_words = "data/stoplist.txt"
        with open(stop_words, encoding='utf-8') as handle:
            # A set gives O(1) membership tests for the per-token filter below.
            stoplist = {line.strip() for line in handle}
        self.df['review'] = self.df['review'].apply(
            lambda words: ' '.join([word for word in words if word not in stoplist]))

    def get_text_vector(self,min_df,max_df,max_features):
        """Fit a TF-IDF vectorizer on the reviews and return the document-term matrix.

        The fitted vectorizer is kept in ``self.tfidf`` so callers can read the
        feature names afterwards.  ``min_df``, ``max_df`` and ``max_features``
        are forwarded unchanged to ``TfidfVectorizer``.
        """
        self.tfidf = TfidfVectorizer(min_df = min_df,max_df = max_df,max_features = max_features)
        return self.tfidf.fit_transform(self.df['review'])

    def find_optimal_clusters(self,text,max_k):
        """Plot the k-means SSE (inertia) for k = 2, 4, ..., ``max_k`` (elbow plot)."""
        iters = list(range(2,max_k+1,2))
        sse = []
        for k in iters:
            km = MiniBatchKMeans(n_clusters = k,init_size = 1024,batch_size = 2048,random_state = 20).fit(text)
            sse.append(km.inertia_)
            print('Fit {} clusters'.format(k))
        f,ax = plt.subplots(1,1)
        ax.plot(iters,sse,marker = 'o')
        ax.set_xlabel('Cluster Centers')
        ax.set_xticks(iters)
        ax.set_xticklabels(iters)
        ax.set_ylabel('SSE')
        ax.set_title('SSE by Cluster Plot')

    def plot_tnse_pca(self,text,cluster,sample_size = 3000,plot_size = 300):
        """Scatter-plot a random sample of documents in PCA and t-SNE space.

        Args:
            text: sparse document-term matrix (rows are documents).
            cluster: one cluster id per row of ``text``.
            sample_size: number of rows projected; clamped to the number of rows.
            plot_size: number of projected rows actually drawn; clamped to
                ``sample_size``.
        """
        cluster = np.asarray(cluster)
        n_rows = text.shape[0]
        # Clamp so corpora smaller than the defaults do not make
        # ``np.random.choice(..., replace=False)`` raise.
        sample_size = min(sample_size, n_rows)
        max_items = np.random.choice(range(n_rows),size = sample_size,replace = False)
        tdata = np.asarray(text[max_items,:].todense())
        pca = PCA(n_components = 2).fit_transform(tdata)
        # t-SNE runs on a PCA-reduced copy; PCA cannot keep more components than
        # min(n_samples, n_features).
        n_reduced = min(50, tdata.shape[0], tdata.shape[1])
        tsne = TSNE().fit_transform(PCA(n_components = n_reduced).fit_transform(tdata))

        plot_size = min(plot_size, pca.shape[0])
        idx = np.random.choice(range(pca.shape[0]),size = plot_size,replace = False)
        # Guard the colour normalisation against a single cluster with id 0.
        max_label = max(cluster.max(), 1)
        label_subset = cluster[max_items]
        label_subset = [cm.hsv(i/max_label) for i in label_subset[idx]]

        f,ax = plt.subplots(1,2,figsize = (14,8))
        ax[0].scatter(pca[idx,0],pca[idx,1],c = label_subset)
        ax[0].set_title('PCA Cluster Plot')

        ax[1].scatter(tsne[idx,0],tsne[idx,1],c = label_subset)
        ax[1].set_title('TSNE Cluster Plot')

    def get_top_keywords(self,text,clusters,labels,n_terms):
        """Print, per cluster, the ``n_terms`` features with the highest mean TF-IDF."""
        df = pd.DataFrame(np.asarray(text.todense())).groupby(clusters).mean()
        for i,r in df.iterrows():
            print('\n Cluster {}'.format(i))
            # argsort is ascending, so the last ``n_terms`` positions are the top terms.
            top_positions = np.argsort(r.to_numpy())[-n_terms:]
            print(','.join([labels[t] for t in top_positions]))

    def _load_negative_words(self):
        """Return the negative-word lexicon, reading both GBK files only once."""
        cached = getattr(self, '_negative_words', None)
        if cached is None:
            cached = []
            for lexicon in ('data/负面情感词语.txt', 'data/负面评价词语.txt'):
                with open(lexicon, encoding = 'gbk') as handle:
                    cached += [word.strip() for word in handle]
            self._negative_words = cached
        return cached

    def replace_word(self,sentence):
        """Return ``sentence`` with one negative word injected.

        The first token jieba tags as an adjective (flag ``'a'``) is replaced by
        a random entry of the negative-word lexicon; when there is no adjective
        the random entry is appended to the end instead.
        """
        wordlist = self._load_negative_words()
        L = len(wordlist) - 1
        # Index 0 is never drawn, as in the original code -- presumably it is a
        # header line of the lexicon file; TODO confirm.
        replacement = wordlist[random.randint(1,L)]
        # ``posseg.cut`` yields lazily; materialise it so the tokens after the
        # replaced adjective can be read by position without skipping any.
        tokens = list(jieba.posseg.cut(sentence.strip()))
        for i,w in enumerate(tokens):
            if w.flag == 'a':
                head = ''.join(t.word for t in tokens[:i])
                tail = ''.join(t.word for t in tokens[i + 1:])
                return head + replacement + tail
        return ''.join(t.word for t in tokens) + replacement

    def get_balance_distri(self):
        """Return ``self.df`` plus one augmented copy of every label-0 row.

        The label-0 (negative) rows are duplicated and each copy is rewritten by
        ``replace_word``.  The copies are appended after the original rows, so
        the result is ordered and should be shuffled before it is split.
        """
        neg_df = self.df[self.df['label'] == 0].copy(deep = True)
        neg_df['review'] = neg_df['review'].apply(self.replace_word)
        # ignore_index avoids duplicated index labels in the combined frame.
        con_df = pd.concat([self.df,neg_df],ignore_index = True)
        return con_df

    def split_data(self,df,split = 8000,valid_size = 1000,shuffle = True,random_state = 1234):
        """Split ``df`` positionally into ``train_df`` / ``valid_df`` / ``test_df``.

        Args:
            df: table to split.
            split: number of rows in the training part.
            valid_size: number of rows in the validation part; the rest is test.
            shuffle: shuffle the rows first.  This is on by default because the
                frame built by ``get_balance_distri`` puts every augmented
                label-0 row at the end, which would otherwise leave the
                validation and test parts with a single class.
            random_state: seed for the shuffle, for reproducible splits.
        """
        if shuffle:
            df = df.sample(frac = 1,random_state = random_state).reset_index(drop = True)
        valid_end = split + valid_size
        self.train_df = df[:split]
        self.valid_df = df[split:valid_end]
        self.test_df = df[valid_end:]

    def store_data(self):
        """Write the ``review`` and ``label`` columns of each split to ``data/*.csv``."""
        self.train_df[['review','label']].to_csv('data/train.csv',index = False)
        self.valid_df[['review','label']].to_csv('data/valid.csv',index = False)
        self.test_df[['review','label']].to_csv('data/test.csv',index = False)

In [None]:
if __name__=='__main__':
    # --- load and clean the corpus -------------------------------------------
    td = Text_Classfier_Data()
    td.data_info()
    td.distri_label()
    td.clear_content()
    td.cut_sentence()
    td.remove_stoplist()

    # --- exploratory clustering on TF-IDF vectors ------------------------------
    text = td.get_text_vector(5,0.95,8000)
    td.find_optimal_clusters(text,20)
    # Fit k-means once and reuse its labels everywhere, so the plots, the printed
    # keywords and the stored ``cluster`` column all describe the same clustering.
    km = MiniBatchKMeans(n_clusters = 5,init_size = 1024,batch_size = 2048,random_state = 20).fit(text)
    clusters = km.labels_
    td.plot_tnse_pca(text,clusters)
    td.get_top_keywords(text,clusters,td.tfidf.get_feature_names_out(),10)
    td.df['cluster'] = clusters

    # --- augment the label-0 class, then split and persist ---------------------
    con_df = td.get_balance_distri()
    print('\n------------new data distribution---------\n')
    print(con_df['label'].value_counts())
    td.split_data(con_df)
    td.store_data()
    print('finished')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\m1824\AppData\Local\Temp\jieba.cache


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7766 entries, 0 to 7765
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   7766 non-null   int64 
 1   review  7766 non-null   object
dtypes: int64(1), object(1)
memory usage: 121.5+ KB
None
1    5322
0    2444
Name: label, dtype: int64


Loading model cost 1.289 seconds.
Prefix dict has been built successfully.


Fit 2 clusters
Fit 4 clusters
Fit 6 clusters
Fit 8 clusters
Fit 10 clusters
Fit 12 clusters
Fit 14 clusters
Fit 16 clusters
Fit 18 clusters
Fit 20 clusters


In [None]:
def DataLoader():
    """Create the torchtext fields and load the three CSV splits from ``data/``.

    Returns:
        A 5-tuple ``(review_field, label_field, train, valid, test)``.  Reviews
        are split on whitespace and fixed to 256 tokens; labels are read without
        a vocabulary.
    """
    def _split_on_whitespace(text):
        return text.split()

    review_field = data.Field(sequential=True, tokenize=_split_on_whitespace, fix_length=256)
    label_field = data.LabelField(sequential=False, use_vocab=False)

    # Column order must match the CSV header (which is skipped on read).
    column_fields = [('review', review_field), ('label', label_field)]
    splits = data.TabularDataset.splits(
        path='data',
        train='train.csv',
        validation='valid.csv',
        test='test.csv',
        format='csv',
        fields=column_fields,
        skip_header=True,
    )
    train_split, valid_split, test_split = splits
    return review_field, label_field, train_split, valid_split, test_split

In [None]:
# Build the torchtext fields and load the train/valid/test datasets from data/.
REVIEW,LABEL,train_data,valid_data,test_data = DataLoader()
# The vocabulary is built from the training split only.
REVIEW.build_vocab(train_data)

In [None]:
# Batch iterators over the three splits: 64 examples per batch, no sorting
# (neither globally nor within a batch), and a single pass per iteration.
train_iter,val_iter,test_iter = data.BucketIterator.splits((train_data,valid_data,test_data),
                                                    batch_size = 64,
                                                    sort = False,
                                                    sort_within_batch = False,repeat = False)
def getTEXT():
    """Return the module-level ``REVIEW`` field."""
    return REVIEW
def getLABEL():
    """Return the module-level ``LABEL`` field."""
    return LABEL
def getIter():
    """Return the module-level ``(train_iter, val_iter, test_iter)`` tuple."""
    return train_iter,val_iter,test_iter

In [None]:
# Pull a single batch from the test iterator for inspection.
batch = next(iter(test_iter))

In [None]:
# Display the batch fetched in the previous cell.
batch

In [None]:
# Seed PyTorch's random number generator so weight initialisation is repeatable.
SEED = 1234
torch.manual_seed(SEED)

class TextRNN_Attention(nn.Module):
    """Bidirectional multi-layer LSTM classifier with additive attention pooling.

    Args:
        args: configuration object.  Required attributes: ``class_num`` (number
            of output classes) and ``dropout`` (dropout between LSTM layers).
            Optional attributes, each falling back to the previously hard-coded
            value when absent: ``vocab_size`` (default: size of the vocabulary
            of the field returned by ``getTEXT()``), ``embedding_dim`` (256),
            ``hidden_size`` (256) and ``num_layers`` (2).
    """

    def __init__(self,args):
        super(TextRNN_Attention, self).__init__()
        # Number of known tokens; only consult the notebook-level field when the
        # caller did not provide the size explicitly.
        vocab_size = getattr(args, 'vocab_size', None)
        if vocab_size is None:
            vocab_size = len(getTEXT().vocab)
        embedding_dim = getattr(args, 'embedding_dim', 256)  # length of each word vector
        hidden_size = getattr(args, 'hidden_size', 256)      # LSTM hidden units per direction
        num_layers = getattr(args, 'num_layers', 2)          # stacked LSTM layers
        num_classes = args.class_num                         # number of target classes

        # Word embeddings are randomly initialised and learned from scratch.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers,
                            bidirectional = True, batch_first = True, dropout = args.dropout)
        self.tanh1 = nn.Tanh()
        # Attention query vector; zeros give uniform attention at initialisation.
        self.w = nn.Parameter(torch.zeros(hidden_size * 2))
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Map token ids ``[batch, seq_len]`` to class logits ``[batch, num_classes]``."""
        # [batch, seq_len] -> [batch, seq_len, embedding_dim]
        embedded = self.embedding(x)
        # output: [batch, seq_len, 2 * hidden_size]; final states are not needed.
        output, _ = self.lstm(embedded)
        scores = torch.matmul(self.tanh1(output), self.w)
        # Normalise over the sequence dimension: [batch, seq_len, 1]
        alpha = F.softmax(scores, dim=1).unsqueeze(-1)
        # Attention-weighted sum over time: [batch, 2 * hidden_size]
        pooled = torch.sum(output * alpha, 1)
        pooled = F.relu(pooled)
        # [batch, num_classes]
        return self.fc(pooled)

In [None]:
class Config():
    """Hyper-parameters shared by the model and the train/test loops."""

    def __init__(self):
        # NOTE: fix_length and batch_size are not read by the data-loading cells
        # above, which hard-code their own values (256 and 64).
        self.fix_length = 256
        self.batch_size = 256
        # Display names indexed by class id.  Label 0 is the negative class (the
        # data-preparation code treats ``label == 0`` rows as negative reviews),
        # so '差' (bad) comes first and '好' (good) second.
        self.label_list = ['差', '好']
        # Derived from the label list so the two can never disagree.
        self.class_num = len(self.label_list)
        # train details
        self.epochs = 6
        self.learning_rate = 1e-4
        self.dropout = 0.2
args = Config()

In [None]:
def test_model(test_iter, name, device,args):
    """Evaluate the checkpoint ``done_model/<name>_model.pkl`` on ``test_iter``.

    Prints the mean per-example loss, the accuracy and a per-class report.

    Args:
        test_iter: batch iterator with a ``dataset`` attribute; each batch
            exposes ``review`` and ``label`` tensors.
        name: model name used to build the checkpoint file name.
        device: torch device the evaluation runs on.
        args: configuration object; ``args.label_list`` supplies the class
            names shown in the report.
    """
    from sklearn.metrics import classification_report

    # NOTE: torch.load unpickles arbitrary Python objects -- only load
    # checkpoints from a trusted source.  map_location lets a checkpoint saved
    # on GPU be evaluated on a CPU-only machine.
    model = torch.load('done_model/'+name+'_model.pkl', map_location=device)
    model = model.to(device)
    model.eval()
    total_loss = 0.0
    correct = 0
    y_true = []
    y_pred = []
    total_test_num = len(test_iter.dataset)
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for batch in test_iter:
            # The model is batch-first, so the 2-D review tensor is transposed.
            feature = torch.t(batch.review).to(device)
            target = batch.label.to(device)
            out = model(feature)
            # Sum (not mean) per batch so dividing by the dataset size below
            # yields the true mean loss per example.
            total_loss += F.cross_entropy(out, target, reduction='sum').item()
            pred = torch.argmax(out, dim=1)
            correct += (pred == target).sum().item()
            y_true.extend(target.cpu().numpy())
            y_pred.extend(pred.cpu().numpy())
    print('>>> Test loss:{}, Accuracy:{} \n'.format(total_loss/total_test_num, correct/total_test_num))
    score = accuracy_score(y_true, y_pred)
    print(score)
    print(classification_report(y_true, y_pred, target_names = args.label_list, digits=3))

def train_model(train_iter, dev_iter, model, name, device,args):
    """Train ``model`` for ``args.epochs`` epochs and keep the best checkpoint.

    After every epoch the model is evaluated on ``dev_iter``; whenever the
    validation accuracy improves, the model is written via ``saveModel``.

    Args:
        train_iter: training batch iterator (has ``dataset``; batches expose
            ``review`` and ``label``).
        dev_iter: validation batch iterator with the same interface.
        model: the ``nn.Module`` to optimise.
        name: model name forwarded to ``saveModel``.
        device: torch device used for training.
        args: configuration object providing ``learning_rate`` and ``epochs``.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr = args.learning_rate)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[15, 25], gamma=0.6)
    best_acc = 0
    total_train_num = len(train_iter.dataset)
    total_valid_num = len(dev_iter.dataset)
    print('training...')
    for epoch in range(1, args.epochs + 1):
        # ---- training pass ----
        model.train()
        total_loss = 0.0
        accuracy = 0
        progress_bar = tqdm(train_iter, total=len(train_iter))
        for batch in progress_bar:
            # The model is batch-first, so the 2-D review tensor is transposed.
            feature = torch.t(batch.review).to(device)
            target = batch.label.to(device)
            optimizer.zero_grad()
            logit = model(feature)
            loss = F.cross_entropy(logit, target)
            loss.backward()
            optimizer.step()
            # Weight the batch-mean loss by the batch size so the epoch figure
            # is a true per-example mean.
            total_loss += loss.item() * target.size(0)
            accuracy += (torch.argmax(logit, dim=1) == target).sum().item()
            progress_bar.set_description(
            f'loss: {loss.item():.3f}')
        # The milestones count epochs, so the scheduler advances once per epoch
        # (stepping per batch would decay the rate after 15 and 25 batches).
        scheduler.step()
        print('>>> Epoch_{}, Train loss is {}, Accuracy:{} \n'.format(epoch,total_loss/total_train_num, accuracy/total_train_num))

        # ---- validation pass (no gradients needed) ----
        model.eval()
        total_loss = 0.0
        accuracy = 0
        with torch.no_grad():
            for batch in tqdm(dev_iter, total=len(dev_iter)):
                feature = torch.t(batch.review).to(device)
                target = batch.label.to(device)
                out = model(feature)
                total_loss += F.cross_entropy(out, target, reduction='sum').item()
                accuracy += (torch.argmax(out, dim=1)==target).sum().item()
        valid_acc = accuracy/total_valid_num
        print('>>> Epoch_{}, Valid loss:{}, Accuracy:{} \n'.format(epoch, total_loss/total_valid_num, valid_acc))
        if valid_acc > best_acc:
            print('save model...')
            best_acc = valid_acc
            saveModel(model, name=name)

def saveModel(model,name):
    """Pickle the whole ``model`` to ``done_model/<name>_model.pkl``.

    The ``done_model`` directory is created when missing, so the first save in
    a fresh working directory does not fail.
    """
    os.makedirs('done_model', exist_ok=True)
    torch.save(model, 'done_model/'+name+'_model.pkl')

# Checkpoint name; saveModel/test_model turn it into done_model/<name>_model.pkl.
name = 'TextRNN_Attention'
model = TextRNN_Attention(args)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
train_iter, val_iter, test_iter = getIter()

# Train first, then evaluate the best checkpoint written during training.
if __name__ == '__main__':
    train_model(train_iter, val_iter, model, name, device,args)
    test_model(test_iter, name, device,args)