# 情感分析

## 1.导入数据

In [1]:
import pandas as pd
d = pd.read_csv("train.csv",lineterminator='\n')
import numpy as np
Y = np.array(d.label == "Positive",dtype = 'float')
Y

array([0., 1., 0., ..., 0., 0., 1.])

In [2]:
X = d.review
X

0                            Jo bhi ap se tou behtar hoon
1               ya Allah meri sister Affia ki madad farma
2       Yeh khud chahta a is umar main shadi krna.  ha...
3             Tc ? Apky mun xe exe alfax achy nae lgty 😒💃
4                                                    Good
5       American president John f Kennedy aur in ke bh...
6        Commission aur kickback ka dor Dora raha, quo...
7       Allah pak nazer e bd sy bechye or humesha boha...
8       Amoman log samajhte hain ke jhok siyal hi Abid...
9       Akki KhanYani k tum ....... v good Wesy tum sh...
10      Jail Road Pr Firing Se 1 Shaks Janbahaq Rpt M ...
11      hud hagai stupid actor hy.....acting ati he na...
12                                Haha thank you so much 
13      Pakistani cricket ki tareekh ka behtreen batsm...
14      Nawaz Sharif ko Pakro Harimi Police walo Tum l...
15                   Kash MERI MAAN ZINDA HOTI...........
16      5 Billion Dollor se zaid akhrajat ka takhmina ...
17      Kisi l

## 2.特征工程，分词器

In [3]:
import re
import nltk
import string

emoticons_str = r"""
    (?:
        [:=;]   
        [oO\-]?  
        [D\)\]\(\]/\\OpP] 
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML
    r'(?:@[\w_]+)',  # @
    r"(?:\#+[\w_]+[\w\'_\-]*[\w]+)",  # 话题标签
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r"(?:[a-z][a-z'\-_]+[a-z])",  # 含有- 和’ 的单词
    r'(?:[\w_]+)',  # 其他_
    r'(?:\S)'  # 所有匹配
]

tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re=re.compile(r'^'+emoticons_str+'$',re.VERBOSE|re.IGNORECASE)

def tokenizer_porter(text):
    punc = string.punctuation
    tokens = tokens_re.findall(text)
    return [word for word in tokens if word not in punc and not word.isdigit()]
tokenizer_porter("Hahahhahhhhahahahahahhahahh oy hmko to mrd e ni ????  123 milta tu the kamyab mrd dhoondny ki baat kra hy 🤣🤣🤣😆😆😆😭😭😭😭")

['Hahahhahhhhahahahahahhahahh',
 'oy',
 'hmko',
 'to',
 'mrd',
 'e',
 'ni',
 'milta',
 'tu',
 'the',
 'kamyab',
 'mrd',
 'dhoondny',
 'ki',
 'baat',
 'kra',
 'hy',
 '🤣',
 '🤣',
 '🤣',
 '😆',
 '😆',
 '😆',
 '😭',
 '😭',
 '😭',
 '😭']

## 3.GridSearchCV tfidf+lr

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
tfidf = TfidfVectorizer()
param_grid = [{
              "vect__tokenizer":[tokenizer_porter,None],
              "clf__penalty":["l1","l2"],
              "clf__C":[0.1,1.0,10.0,100.0]}
             ]
lr_tfidf = Pipeline([('vect',tfidf),('clf',LogisticRegression(random_state=0))])
                
gs_lr_tfid = GridSearchCV(lr_tfidf,param_grid,scoring='roc_auc',cv=5,verbose=1,n_jobs=-1)
                
gs_lr_tfid.fit(X,Y)
print(gs_lr_tfid.best_params_)
print(gs_lr_tfid.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  3.1min finished


{'clf__C': 100.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 2), 'vect__tokenizer': None}




### tfidf+lr

In [17]:
from sklearn.model_selection import cross_val_score
best_tfidf = TfidfVectorizer(tokenizer = tokenizer_porter,ngram_range=(1, 2),max_df=0.5)
best_lr_tfidf = LogisticRegression(C=100.0, penalty='l2')
cross_result = cross_val_score(best_lr_tfidf,best_tfidf.fit_transform(X),Y,cv=10,scoring='roc_auc')
np.mean(cross_result)



0.8543895924578919

### tfidf+NB

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
tfidf = TfidfVectorizer(tokenizer = tokenizer_porter)
features = tfidf.fit_transform(X);
cross_result = cross_val_score(mnb,features,Y,cv=10,scoring='roc_auc')
X

0                            Jo bhi ap se tou behtar hoon
1               ya Allah meri sister Affia ki madad farma
2       Yeh khud chahta a is umar main shadi krna.  ha...
3             Tc ? Apky mun xe exe alfax achy nae lgty 😒💃
4                                                    Good
5       American president John f Kennedy aur in ke bh...
6        Commission aur kickback ka dor Dora raha, quo...
7       Allah pak nazer e bd sy bechye or humesha boha...
8       Amoman log samajhte hain ke jhok siyal hi Abid...
9       Akki KhanYani k tum ....... v good Wesy tum sh...
10      Jail Road Pr Firing Se 1 Shaks Janbahaq Rpt M ...
11      hud hagai stupid actor hy.....acting ati he na...
12                                Haha thank you so much 
13      Pakistani cricket ki tareekh ka behtreen batsm...
14      Nawaz Sharif ko Pakro Harimi Police walo Tum l...
15                   Kash MERI MAAN ZINDA HOTI...........
16      5 Billion Dollor se zaid akhrajat ka takhmina ...
17      Kisi l

## 4.输出结果

In [16]:
test = pd.read_csv("20190520_test.csv",lineterminator='\n')
mnb.fit(tfidf.transform(X),Y)
pred = mnb.predict_proba(tfidf.transform(test.review))
pred
data = {'ID':np.array(test['ID']),"Pred":pred[:,1]}
pd.DataFrame(data).to_csv('atest_pred2.csv', header=True,index=False)
data

{'ID': array([   1,    2,    3, ..., 2710, 2711, 2712]),
 'Pred': array([0.41612916, 0.03755352, 0.52017414, ..., 0.76806404, 0.6015265 ,
        0.80126074])}

In [26]:
test = pd.read_csv("20190513_test.csv",lineterminator='\n')
best_lr_tfidf.fit(best_tfidf.transform(X),Y)
pred = best_lr_tfidf.predict_proba(best_tfidf.transform(test['review']))
pred
best_tfidf.transform(d['review']).toarray()
pred

array([[0.00825091, 0.99174909],
       [0.02411725, 0.97588275],
       [0.05489165, 0.94510835],
       ...,
       [0.38976453, 0.61023547],
       [0.59693397, 0.40306603],
       [0.72226277, 0.27773723]])

In [33]:
data = {'ID':np.array(test['ID']),"Pred":pred[:,1]}
pd.DataFrame(data).to_csv('atest_pred.csv', header=True,index=False)
data

{'ID': array([   1,    2,    3, ..., 2710, 2711, 2712]),
 'Pred': array([0.99174909, 0.97588275, 0.94510835, ..., 0.61023547, 0.40306603,
        0.27773723])}