In [30]:
import pandas as pd
import re
import nltk.stem as stem
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Snowball (English) stemmer shared by the text-cleaning step below.
stemmer = stem.snowball.EnglishStemmer()

# Load the three tab-separated review files. Each read keeps its own parser
# options: only the unlabeled set turns quote handling off (quoting=3 is
# csv.QUOTE_NONE) and only the test set overrides the text encoding.
train = pd.read_csv(
    '~/Downloads/labeledTrainData.tsv',
    sep='\t',
)
unlabeled_train = pd.read_csv(
    '~/Downloads/unlabeledTrainData.tsv',
    sep='\t',
    quoting=3,
)
test = pd.read_csv(
    '~/Downloads/testData.tsv',
    sep='\t',
    encoding='ISO-8859-1',
)

# Target column used to fit the classifier.
y_train = train['sentiment']

# Preview the first rows of the labeled data.
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [31]:
# Everything except lower-case letters and apostrophes is blanked out before
# contractions are expanded (digits and punctuation are dropped too).
_NON_LETTER_RE = re.compile(r"[^a-z']")

# Contraction rewrites, applied in order to lower-cased text.
# Ordering matters: the irregular negatives must run BEFORE the generic "n't"
# rule, otherwise "didn't" is already "did not" (and "ain't" is "ai not") by
# the time the specific rule is tried and it can never match.
# The suffix rules require a preceding letter and a trailing word boundary so
# that an opening quote is not mistaken for a contraction
# (e.g. "'so called'" must not turn into " o called'").
_CONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"\bwhat's\b", 'what is '),
        # Irregular / tense-normalising negatives.
        (r"\bcan't\b", 'can not '),
        (r"\bcannot\b", 'can not '),
        (r"\bwon't\b", ' will not '),
        (r"\bain't\b", ' are not '),
        (r"\baren't\b", ' are not '),
        (r"\bcouldn't\b", ' can not '),
        (r"\bdidn't\b", ' do not '),
        (r"\bdoesn't\b", ' do not '),
        (r"\bdon't\b", ' do not '),
        (r"\bhadn't\b", ' have not '),
        (r"\bhasn't\b", ' have not '),
        # Generic negative for everything not listed above.
        (r"(?<=[a-z])n't\b", ' not '),
        # Suffix contractions.
        (r"(?<=[a-z])'s\b", ' '),
        (r"(?<=[a-z])'ve\b", ' have '),
        (r"(?<=[a-z])'m\b", ' am '),
        (r"(?<=[a-z])'re\b", ' are '),
        # "'d" stands for "would"/"had", never "will".
        (r"(?<=[a-z])'d\b", ' would '),
        (r"(?<=[a-z])'ll\b", ' will '),
    )
)


def clean_text(texts):
    """Normalise and stem a collection of raw review strings.

    For every input string the function:
      1. lower-cases it;
      2. replaces every character other than a-z and the apostrophe with a space;
      3. expands common English contractions (see ``_CONTRACTION_RULES``);
      4. tokenises with ``word_tokenize`` and stems each token with the
         module-level ``stemmer``.

    Args:
        texts: iterable of ``str``.

    Returns:
        list of ``str``, one per input and in the same order. Each stemmed
        token is preceded by a single space, so non-empty results start with
        a space; an input with no tokens yields ``''``.
    """
    text_list = []
    for text in texts:
        # Lower-case first so the a-z character class and the rules apply.
        text = _NON_LETTER_RE.sub(' ', text.lower())

        # Expand contractions in the documented order.
        for pattern, replacement in _CONTRACTION_RULES:
            text = pattern.sub(replacement, text)

        # Stem token by token; a single join avoids quadratic string growth
        # while keeping the original " tok tok ..." output layout.
        text_list.append(
            ''.join(' ' + stemmer.stem(word) for word in word_tokenize(text))
        )
    return text_list

# Clean and stem the review column of both splits (train first, then test).
train_data_content, test_data_content = (
    clean_text(list(frame['review'])) for frame in (train, test)
)

In [33]:
# Spot-check: show the cleaned, stemmed text of the second training review.
train_data_content[1]

" the classic war of the world by timothi hine is a veri entertain film that obvious goe to great effort and length to faith recreat h g well ' classic book mr hine succeed in do so i and those who watch his film with me appreci the fact that it was not the standard predict hollywood fare that come out everi year e g the spielberg version with tom cruis that had onli the slightest resembl to the book obvious everyon look for differ thing in a movi those who envis themselv as amateur critic look onli to critic everyth they can other rate a movi on more import base like be entertain which is whi most peopl never agre with the critic we enjoy the effort mr hine put into be faith to h g well ' classic novel and we found it to be veri entertain this made it easi to overlook what the critic perceiv to be it shortcom"

In [34]:
# TF-IDF features.
# The vectorizer is fitted on the cleaned train and test reviews together and
# each split is then transformed separately and converted to a dense array.
all_text_list = [*train_data_content, *test_data_content]

text_vector = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=5000,
    strip_accents='unicode',
    sublinear_tf=True,
)
text_vector.fit(all_text_list)

X_train, X_test = (
    text_vector.transform(split).toarray()
    for split in (train_data_content, test_data_content)
)


In [35]:
# Sanity check: feature-matrix shapes and the container type produced by .toarray().
print(X_train.shape, X_test.shape, type(X_train))

(25000, 5000) (5000, 5000) <class 'numpy.ndarray'>


In [42]:
# Fit a logistic-regression classifier on the TF-IDF training features
# (C=100.0) and predict hard class labels for the test features.
model = LogisticRegression(C=100.0).fit(X_train, y_train)
pred = model.predict(X_test)

# Optional diagnostics, intentionally left disabled:
#   model.score(X_train, y_train)   -> accuracy on the training data
#   model.predict_proba(X_test)     -> per-class probabilities for the test set



In [43]:
# Display the predicted labels for the test set.
pred

array([0, 1, 1, ..., 0, 0, 1])

In [44]:
# Load the sample submission, overwrite its sentiment column with the
# predicted labels, and write the result out without the index column.
submission = pd.read_csv('~/Downloads/sampleSubmission.csv').assign(sentiment=pred)
submission.to_csv('~/Downloads/m_submission.csv', index=False)