# 아마존 리뷰 텍스트 파일로 감성 사전 구축하기
- 로지스틱 회귀 모델 사용

In [3]:
import pandas as pd

# Load the tab-separated review file. It has no header row, so pandas labels
# the columns 0 (review text) and 1 (0/1 sentiment label).
df = pd.read_csv(
    "./amazon_cells_labelled.txt",
    sep="\t",
    header=None,
)
df.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [4]:
# Summarize the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1000 non-null   object
 1   1       1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Column 0 holds the review text (features); column 1 holds the 0/1 label (target).
X, y = df[0], df[1]

In [9]:
# Build a TF-IDF representation of the reviews: English stop words are
# dropped and the vocabulary is capped at 1000 terms.
tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=1000,
)
# Learn the vocabulary and produce the document-term matrix in one step.
X_tdm = tfidf.fit_transform(X)

In [10]:
# Convert the TF-IDF matrix to a dense NumPy array so it can be inspected.
X_tdm.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# List the vocabulary terms learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (it returns a NumPy array of
# strings instead of a list). Requires scikit-learn >= 1.0.
tfidf.get_feature_names_out()

['10',
 '100',
 '11',
 '12',
 '13',
 '15',
 '15g',
 '18',
 '20',
 '2000',
 '2005',
 '2160',
 '24',
 '2mp',
 '325',
 '350',
 '375',
 '3o',
 '42',
 '44',
 '45',
 '4s',
 '50',
 '5020',
 '510',
 '5320',
 '680',
 '700w',
 '8125',
 '8525',
 '8530',
 'abhor',
 'ability',
 'able',
 'abound',
 'absolutel',
 'absolutely',
 'ac',
 'accept',
 'acceptable',
 'access',
 'accessory',
 'accessoryone',
 'accidentally',
 'accompanied',
 'actually',
 'ad',
 'adapter',
 'adapters',
 'add',
 'addition',
 'additional',
 'address',
 'adhesive',
 'adorable',
 'advertised',
 'advise',
 'aggravating',
 'ago',
 'alarm',
 'allot',
 'allow',
 'allowing',
 'allows',
 'alot',
 'aluminum',
 'amazed',
 'amazing',
 'amazon',
 'amp',
 'ample',
 'angeles',
 'angle',
 'answer',
 'ant',
 'antena',
 'anti',
 'apart',
 'apartment',
 'apparently',
 'appealing',
 'appearance',
 'appears',
 'applifies',
 'appointments',
 'area',
 'arguing',
 'armband',
 'arrival',
 'arrived',
 'asia',
 'ask',
 'aspect',
 'assumed',
 'atleast',


In [13]:
# Map the first review's TF-IDF row back to the vocabulary terms it contains.
tfidf.inverse_transform(X_tdm[0])

[array(['unless', 'plug', 'way'], dtype='<U15')]

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X_tdm,
    y,
    test_size=0.3,
    random_state=103,
)

# Sanity-check the shapes of the four resulting pieces.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(700, 1000) (300, 1000) (700,) (300,)


In [18]:
from sklearn.linear_model import LogisticRegression
#from sklearn.linear_model import LogisticRegressionCV
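# LogisticRegressionCV chooses the regularization strength C via built-in cross-validation, so it can score slightly better than a default LogisticRegression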
from sklearn.metrics import accuracy_score

# Logistic-regression classifier with default hyperparameters.
lr_clf = LogisticRegression()
lr_clf.fit(x_train, y_train)

# Mean accuracy on the data the model was fitted on.
train_score = lr_clf.score(x_train, y_train)
print("score: ", train_score)

# Accuracy on the held-out split.
pred = lr_clf.predict(x_test)
test_accuracy = accuracy_score(y_test, pred)
print("accuracy: ", test_accuracy)

score:  0.9542857142857143
accuracy:  0.7766666666666666


In [19]:
lr_clf.coef_ # inspect the regression coefficient learned for each word

array([[ 4.17243034e-01,  0.00000000e+00,  0.00000000e+00,
        -1.45332451e-01,  0.00000000e+00,  0.00000000e+00,
        -1.61992991e-01,  1.12130591e-01,  3.35577475e-02,
         0.00000000e+00,  1.71656208e-01, -1.44229014e-01,
         0.00000000e+00,  5.86631178e-02,  1.07210888e-01,
         6.41603249e-02,  0.00000000e+00, -1.49336931e-01,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -1.25778673e-01, -2.32357753e-01,  0.00000000e+00,
         1.72889123e-01, -1.54620458e-01,  6.49084545e-02,
         1.50411162e-01, -1.10049166e-01, -1.43365627e-01,
         2.58194833e-01,  0.00000000e+00, -2.92491314e-01,
        -9.66431565e-02, -2.48663777e-01, -2.15184808e-01,
         5.13136224e-01,  2.06846030e-01, -3.07670390e-01,
         0.00000000e+00, -2.18986684e-02, -3.04190513e-01,
        -1.95203854e-01, -2.23129116e-01,  3.85951617e-01,
        -4.02182805e-02, -1.61932873e-01, -1.53409162e-01,
         1.40726289e-01, -1.66494523e-01, -1.63064346e-0

In [20]:
# Pair every vocabulary term with its logistic-regression coefficient.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
st_df = pd.DataFrame({
    "단어": tfidf.get_feature_names_out(),
    # coef_ is 2-D; flatten it to a plain 1-D array instead of passing a
    # flat iterator to the DataFrame constructor.
    "회귀 계수": lr_clf.coef_.ravel(),
})

st_df.tail()

Unnamed: 0,단어,회귀 계수
995,wrongly,-0.128435
996,year,-0.360836
997,years,0.381672
998,yell,-0.231318
999,yes,0.21889


### 회귀 계수 < 0 : 부정적 단어  
### 회귀 계수 > 0 : 긍정적 단어

In [23]:
# Words with a negative coefficient, ordered from most to least negative.
st_neg = st_df.loc[st_df["회귀 계수"].lt(0)].sort_values(by="회귀 계수")
st_neg.head()

Unnamed: 0,단어,회귀 계수
492,poor,-1.580278
103,bad,-1.373457
988,worst,-1.170627
323,hear,-1.126084
333,horrible,-1.100536


In [30]:
# Words with a positive coefficient, ordered from most to least positive.
st_pos = st_df.loc[st_df["회귀 계수"].gt(0)].sort_values(
    by="회귀 계수",
    ascending=False,
)
st_pos.head()

Unnamed: 0,단어,회귀 계수,극성
310,great,3.375385,1.0
985,works,2.260295,1.0
306,good,1.880174,1.0
255,excellent,1.860913,1.0
591,recommend,1.796219,1.0


In [31]:
import numpy as np

# np.sign maps each coefficient to its sign:
# 1 for positive, 0 for zero, -1 for negative.
coefficients = st_df["회귀 계수"]
st_df["극성"] = np.sign(coefficients)

st_df.tail()

Unnamed: 0,단어,회귀 계수,극성
995,wrongly,-0.128435,-1.0
996,year,-0.360836,-1.0
997,years,0.381672,1.0
998,yell,-0.231318,-1.0
999,yes,0.21889,1.0


In [32]:
# Count how many words fall into each polarity class (-1, 0, 1).
st_df['극성'].value_counts()

-1.0    475
 1.0    408
 0.0    117
Name: 극성, dtype: int64

부정 단어 475개  
긍정 단어 408개  
중립(회귀 계수 = 0) 단어 117개

In [33]:
# Net polarity of the lexicon: (positive count - negative count) divided by
# the number of non-neutral words, since each polarity value is -1, 0 or 1.
polarity = st_df["극성"]
polarity.sum() / polarity.abs().sum()

-0.07587768969422423