# LDA+二分类的文本分类
 - LDA 训练较慢：当 CountVectorizer 的词表（特征维度）很大时，不推荐将其用于文本分类

In [9]:
import sys
import os
import re

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.util import ngrams

import torch as t
from torch import nn
from torch.utils import data
from torch.utils.data import DataLoader

stop_words = set(stopwords.words('english'))

In [10]:
def clean_text(s, stopword_set=None):
    """Normalize a raw review into a space-separated string of content words.

    Steps: replace HTML tags with a space, replace every character other
    than ASCII letters and apostrophes with a space, lowercase, tokenize on
    whitespace, and drop stop words.

    Args:
        s: Raw review text.
        stopword_set: Optional collection of lowercase words to drop. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The remaining tokens joined by single spaces, with no leading,
        trailing, or repeated whitespace.
    """
    active_stop_words = stop_words if stopword_set is None else stopword_set
    s = re.sub(r'<[^>]+>', ' ', s)
    s = re.sub(r'[^a-zA-Z\']', ' ', s)
    # split() with no argument collapses runs of whitespace, so the empty
    # tokens created by the substitutions above never reach the output.
    tokens = s.lower().split()
    return " ".join(w for w in tokens if w not in active_stop_words)

In [11]:
# Locate the labeled training set and the unlabeled test set under data_path.
data_path = r'E:\kaggle\movies'
train_data_path, test_data_path = (
    os.path.join(data_path, fname)
    for fname in ('labeledTrainData.tsv', 'testData.tsv')
)

# Both files are tab-separated with a header row.
train_df, test_df = (
    pd.read_csv(tsv_path, header=0, sep='\t')
    for tsv_path in (train_data_path, test_data_path)
)

# Store the cleaned version of each review in a new 'text' column.
for frame in (test_df, train_df):
    frame['text'] = frame.review.apply(clean_text)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [13]:
# Materialize the cleaned reviews as plain Python lists, in row order.
# Series.tolist() avoids the per-element label lookup of `series[i]`, which
# is slow and only works when the index labels happen to be 0..n-1.
train_x = train_df['text'].tolist()
test_x = test_df['text'].tolist()

25000

In [51]:
# Fit the vectorizer and the topic model on the train and test reviews
# together (train rows first), so both splits share one vocabulary and one
# topic space.
all_x = train_x + test_x
cntVec = CountVectorizer(max_features=5)  # max_features drops vocabulary terms and is not recommended; it is kept tiny here only so that LDA can finish running
cntTF = cntVec.fit_transform(all_x)
# Two topics: docres gets one row per document and one column per topic.
# NOTE(review): no random_state is passed, so the fitted topics are
# presumably not reproducible from run to run - confirm.
lda = LatentDirichletAllocation(n_components=2)
docres = lda.fit_transform(cntTF)

In [52]:
# Sanity check: (number of documents, number of LDA topics).
docres.shape

(50000, 2)

In [53]:
from sklearn.linear_model import LogisticRegression
# Sanity check: (number of documents, vocabulary size) of the count matrix.
cntTF.shape

(50000, 5)

In [54]:
# Training labels: the 'sentiment' column of the labeled training frame.
train_y = train_df['sentiment']
# Peek at the LDA output rows for the first five documents.
docres[:5]

array([[0.48502967, 0.51497033],
       [0.48813959, 0.51186041],
       [0.58528994, 0.41471006],
       [0.77831181, 0.22168819],
       [0.50028095, 0.49971905]])

In [55]:
# Peek at the first five training labels.
train_y[:5]

0    1
1    1
2    0
3    0
4    1
Name: sentiment, dtype: int64

In [56]:
# docres stacks the training documents first and the test documents after
# them, so the number of training texts is the split point.
n_train = len(train_x)
train_topics, test_topics = docres[:n_train], docres[n_train:]

# Train the classifier on the training rows and predict the test rows.
lr = LogisticRegression()
lr.fit(train_topics, train_y)
lr_res = lr.predict(test_topics)



In [57]:
# Pair each test id with its predicted sentiment and write the result as a
# CSV (without the index column) next to the input data.
submission_path = os.path.join(data_path, "LDA_LR_result.csv")
lr_df = pd.DataFrame({'id': test_df['id'], 'sentiment': lr_res})
lr_df.to_csv(submission_path, index=False)