### Data Loading 

In [54]:
import pandas as pd
import glob ,os

# Directory where the Opinosis archive was extracted on the author's machine;
# point this at your own copy of the `topics` folder.
path = r'C:\Users\y9941\perfectGuide\data\OpinosisDataset1.0\topics'
# Collect the paths of every .data file directly under `path`.
all_files = glob.glob(os.path.join(path, "*.data"))
filename_list = []
opinion_text = []


def topic_name_from_path(file_path):
    """Return the file's base name cut at its first dot.

    Uses os.path.basename so the directory part is stripped with the
    separator of the operating system the notebook runs on, instead of
    assuming Windows backslashes. Everything from the first '.' onward is
    dropped, so a multi-part suffix is removed in one step.
    """
    return os.path.basename(file_path).split('.')[0]


# For each file: keep its topic name in filename_list and its content,
# loaded as a DataFrame and rendered back to one string, in opinion_text.
for file_ in all_files:
    # The first line of each file is consumed as the header row (header=0).
    df = pd.read_table(file_, index_col=None, header=0, encoding='latin1')

    filename_list.append(topic_name_from_path(file_))
    # NOTE(review): DataFrame.to_string() renders the header and row index by
    # default, so they end up inside the text as well - confirm this is intended.
    opinion_text.append(df.to_string())

# Assemble the names and texts into a single DataFrame, one row per file.
document_df = pd.DataFrame({'filename': filename_list, 'opinion_text': opinion_text})
document_df.head()

Unnamed: 0,filename,opinion_text
0,accuracy_garmin_nuvi_255W_gps,", and is very, very acc..."
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and..."
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my ...
3,battery-life_ipod_nano_8gb,short battery life I moved up from a...
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 2..."


In [55]:
# Print a summary of the frame: index range, columns, non-null counts and dtypes.
document_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 2 columns):
filename        51 non-null object
opinion_text    51 non-null object
dtypes: object(2)
memory usage: 944.0+ bytes


In [56]:
import nltk
# Downloads the complete NLTK data collection ('all').
# NOTE(review): this notebook only appears to use the tokenizer, POS tagger,
# WordNet, SentiWordNet and the VADER lexicon - consider downloading just
# those resources to keep a fresh run fast; confirm the exact resource names
# for the installed NLTK version.
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\y9941\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\y9941\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     C:\Users\y9941\AppData\Roaming\nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\y9941\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     C:\Users\y9941\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     C:\Users\y

True

### Lemmatization을 위한 함수 생성

In [57]:
from nltk.stem import WordNetLemmatizer
import nltk
import string

# Translation table for str.translate: every ASCII punctuation code point maps
# to None, which deletes that character from the text.
remove_punct_dict = {ord(symbol): None for symbol in string.punctuation}
# Single WordNetLemmatizer instance used by LemTokens.
lemmar = WordNetLemmatizer()

def LemTokens(tokens):
    """Return a list holding the lemma of each token, in input order.

    Each token is passed to the module-level ``lemmar`` lemmatizer.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmar.lemmatize(word))
    return lemmas

def LemNormalize(text):
    """Lower-case ``text``, strip punctuation, tokenize it and lemmatize the tokens.

    Punctuation is removed with ``remove_punct_dict``, tokenization is done by
    ``nltk.word_tokenize`` and the tokens are lemmatized by ``LemTokens``.
    Returns the list produced by ``LemTokens``.
    """
    without_punct = text.lower().translate(remove_punct_dict)
    word_tokens = nltk.word_tokenize(without_punct)
    return LemTokens(word_tokens)

### TF-IDF 피처 벡터화, TfidfVectorizer에서 피처 벡터화 수행 시 Lemmatization을 적용하여 토큰화

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that tokenizes with LemNormalize, removes English stop
# words, uses unigrams and bigrams, and keeps terms whose document frequency
# lies between 5% and 85%.
# NOTE(review): the warning fragment in this cell's output mentions
# 'stop_words' - presumably the stop-word list does not match the lemmatized
# tokens; confirm whether that matters for the results.
tfidf_vect = TfidfVectorizer(
    tokenizer=LemNormalize,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.85,
)

# Fit on the opinion_text column and transform it into the feature matrix.
feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])


  'stop_words.' % sorted(inconsistent))


### 3개의 군집으로 clustering

In [59]:
from sklearn.cluster import KMeans

# Fit K-Means with three clusters on the TF-IDF features (fit returns the
# estimator itself, so it can be chained onto the constructor).
km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=0).fit(feature_vect)
cluster_label = km_cluster.labels_

# Store each document's cluster in the cluster_label column, then show the
# frame ordered by that column.
document_df['cluster_label'] = cluster_label
document_df.sort_values(by='cluster_label')

Unnamed: 0,filename,opinion_text,cluster_label
0,accuracy_garmin_nuvi_255W_gps,", and is very, very acc...",0
48,updates_garmin_nuvi_255W_gps,Another thing to consider was that I paid $...,0
44,speed_windows7,"Windows 7 is quite simply faster, more sta...",0
43,speed_garmin_nuvi_255W_gps,Another feature on the 255w is a display of...,0
42,sound_ipod_nano_8gb,headphone jack i got a clear case for it a...,0
41,size_asus_netbook_1005ha,A few other things I'd like to point out i...,0
36,screen_netbook_1005ha,Keep in mind that once you get in a room ...,0
35,screen_ipod_nano_8gb,"As always, the video screen is sharp and b...",0
34,screen_garmin_nuvi_255W_gps,It is easy to read and when touching the...,0
33,satellite_garmin_nuvi_255W_gps,It's fast to acquire satel...,0


## VADER

In [60]:
def vader_polarity(review, threshold=0.1, analyzer=None):
    """Classify a review as positive (1) or negative (0) from its VADER compound score.

    Args:
        review: Text to score.
        threshold: Compound score at or above which the review counts as positive.
        analyzer: Optional object exposing ``polarity_scores(text)`` that returns
            a mapping with a ``'compound'`` entry. Pass one shared instance when
            scoring many reviews so it is not rebuilt on every call. When omitted,
            a new ``SentimentIntensityAnalyzer`` is created.

    Returns:
        1 if the compound score is greater than or equal to ``threshold``, else 0.
    """
    if analyzer is None:
        # Imported here because the notebook only imports this class in a later
        # cell; without it a fresh kernel run fails with NameError.
        from nltk.sentiment.vader import SentimentIntensityAnalyzer
        analyzer = SentimentIntensityAnalyzer()

    compound_score = analyzer.polarity_scores(review)['compound']
    return 1 if compound_score >= threshold else 0

# Score every opinion text with vader_polarity (threshold 0.1) and keep the
# 0/1 result in the 'vader_preds' column.
document_df['vader_preds'] = document_df['opinion_text'].apply(vader_polarity, args=(0.1,))
# document_df has no 'sentiment' label column, so no y_target is built here.
vader_preds = document_df['vader_preds'].values

In [61]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Show the raw VADER score dictionary for the document at index label 0.
senti_analyzer = SentimentIntensityAnalyzer()
senti_scores = senti_analyzer.polarity_scores(document_df.loc[0, 'opinion_text'])
print(senti_scores)

{'neg': 0.014, 'neu': 0.798, 'pos': 0.188, 'compound': 0.9992}


In [62]:
# Preview the first ten rows rather than rendering the whole frame.
document_df.head(10)

Unnamed: 0,filename,opinion_text,cluster_label,vader_preds
0,accuracy_garmin_nuvi_255W_gps,", and is very, very acc...",0,1
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and...",1,1
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my ...,0,1
3,battery-life_ipod_nano_8gb,short battery life I moved up from a...,0,1
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 2...",0,1
5,buttons_amazon_kindle,I thought it would be fitting to christen ...,0,1
6,comfort_honda_accord_2008,"Drivers seat not comfortable, the car its...",2,1
7,comfort_toyota_camry_2007,Ride seems comfortable and gas mileage fa...,2,1
8,directions_garmin_nuvi_255W_gps,You also get upscale features like spoken ...,0,1
9,display_garmin_nuvi_255W_gps,3 quot widescreen display was a ...,0,1


## SentiWordNet

In [63]:
from nltk.corpus import wordnet as wn

def penn_to_wn(tag):
    """Convert a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags starting with J, N, R or V map to wn.ADJ, wn.NOUN, wn.ADV and wn.VERB
    respectively; any other tag yields None.
    """
    prefix_to_constant = (('J', 'ADJ'), ('N', 'NOUN'), ('R', 'ADV'), ('V', 'VERB'))
    for prefix, constant_name in prefix_to_constant:
        if tag.startswith(prefix):
            return getattr(wn, constant_name)
    return None


In [64]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag

def swn_polarity(text):
    """Classify ``text`` as positive (1) or negative (0) using SentiWordNet scores.

    The text is split into sentences, each sentence is word-tokenized and POS
    tagged, and only nouns, adjectives and adverbs are scored. Every such word
    is lemmatized, looked up in WordNet, and the first synset's SentiWordNet
    positive score minus its negative score is added to a running total.

    Returns:
        0 when no word could be scored; otherwise 1 if the total is greater
        than or equal to zero, else 0.
    """
    total_score = 0.0
    scored_words = 0
    wordnet_lemmatizer = WordNetLemmatizer()

    for sentence in sent_tokenize(text):
        # Tag every word of the sentence with its Penn Treebank part of speech.
        for token, penn_tag in pos_tag(word_tokenize(sentence)):
            wordnet_pos = penn_to_wn(penn_tag)
            # Only nouns, adjectives and adverbs contribute to the score.
            if wordnet_pos in (wn.NOUN, wn.ADJ, wn.ADV):
                base_form = wordnet_lemmatizer.lemmatize(token, pos=wordnet_pos)
                if base_form:
                    candidates = wn.synsets(base_form, pos=wordnet_pos)
                    if candidates:
                        # Score the first synset returned for the lemma.
                        senti = swn.senti_synset(candidates[0].name())
                        total_score += (senti.pos_score() - senti.neg_score())
                        scored_words += 1

    if scored_words == 0:
        return 0

    return 1 if total_score >= 0 else 0


In [65]:
# Run swn_polarity on every opinion text and store the 0/1 result in 'senti_preds'.
document_df['senti_preds'] = document_df['opinion_text'].apply(swn_polarity)

In [66]:
# Preview the first ten rows rather than rendering the whole frame.
document_df.head(10)

Unnamed: 0,filename,opinion_text,cluster_label,vader_preds,senti_preds
0,accuracy_garmin_nuvi_255W_gps,", and is very, very acc...",0,1,1
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and...",1,1,0
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my ...,0,1,1
3,battery-life_ipod_nano_8gb,short battery life I moved up from a...,0,1,1
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 2...",0,1,1
5,buttons_amazon_kindle,I thought it would be fitting to christen ...,0,1,1
6,comfort_honda_accord_2008,"Drivers seat not comfortable, the car its...",2,1,0
7,comfort_toyota_camry_2007,Ride seems comfortable and gas mileage fa...,2,1,0
8,directions_garmin_nuvi_255W_gps,You also get upscale features like spoken ...,0,1,0
9,display_garmin_nuvi_255W_gps,3 quot widescreen display was a ...,0,1,1


In [67]:
# Summary statistics for the documents in cluster 0. The mask is built from
# the frame's own cluster_label column, so it always lines up with the rows.
document_df[document_df['cluster_label'] == 0].describe()

Unnamed: 0,cluster_label,vader_preds,senti_preds
count,25.0,25.0,25.0
mean,0.0,1.0,0.84
std,0.0,0.0,0.374166
min,0.0,1.0,0.0
25%,0.0,1.0,1.0
50%,0.0,1.0,1.0
75%,0.0,1.0,1.0
max,0.0,1.0,1.0


In [68]:
# Summary statistics for the documents in cluster 1. The mask is built from
# the frame's own cluster_label column, so it always lines up with the rows.
document_df[document_df['cluster_label'] == 1].describe()

Unnamed: 0,cluster_label,vader_preds,senti_preds
count,16.0,16.0,16.0
mean,1.0,1.0,0.8125
std,0.0,0.0,0.403113
min,1.0,1.0,0.0
25%,1.0,1.0,1.0
50%,1.0,1.0,1.0
75%,1.0,1.0,1.0
max,1.0,1.0,1.0


In [69]:
# Summary statistics for the documents in cluster 2. The mask is built from
# the frame's own cluster_label column, so it always lines up with the rows.
document_df[document_df['cluster_label'] == 2].describe()

Unnamed: 0,cluster_label,vader_preds,senti_preds
count,10.0,10.0,10.0
mean,2.0,0.9,0.6
std,0.0,0.316228,0.516398
min,2.0,0.0,0.0
25%,2.0,1.0,0.0
50%,2.0,1.0,1.0
75%,2.0,1.0,1.0
max,2.0,1.0,1.0
