## 라이브러리와 파일 불러오기

In [51]:
import os
import pandas as pd
import numpy as np
from glob import glob

# text processing libraries
import re
import string
import nltk
from nltk.corpus import stopwords

# Visualisation libraries
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import chart_studio.plotly as py
import plotly.figure_factory as ff
from plotly.offline import iplot
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

# sklearn 
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Pytorch
# import torch

# Transformers
# from transformers import BertTokenizer

In [12]:
# Collect every CSV file shipped with the competition data.
# NOTE(review): glob() returns paths in arbitrary, filesystem-dependent order,
# so positions in this list are not stable across machines.
data = glob('data/tweet-sentiment-extraction/*.csv')
data

['data/tweet-sentiment-extraction/test.csv',
 'data/tweet-sentiment-extraction/train.csv',
 'data/tweet-sentiment-extraction/sample_submission.csv']

In [14]:
# glob() gives no ordering guarantee, so pick each file by its name rather
# than by its position in `data`.
csv_by_name = {os.path.basename(path): path for path in data}
train = pd.read_csv(csv_by_name['train.csv'])
test = pd.read_csv(csv_by_name['test.csv'])

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (27481, 4)
Testing data shape:  (3534, 3)


## 데이터 살펴보기

In [20]:
# Preview the first rows of the training and test frames.
display(train.head())
display(test.head())

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


## EDA

### 결측치 확인 & 제거

In [21]:
# Count missing values per column in the training set.
train.isnull().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [22]:
# Count missing values per column in the test set.
test.isnull().sum()

textID       0
text         0
sentiment    0
dtype: int64

In [23]:
# Drop every training row that contains at least one missing value.
# dropna(how=...) accepts 'any' (the default) or 'all':
#   'any' : drop the row/column if any NA value is present.
#   'all' : drop the row/column only if all of its values are NA.
# axis=0 removes rows; inplace=True modifies `train` directly.
train.dropna(axis=0, how='any', inplace=True)

## 데이터 샘플 확인

In [27]:
# Show the first tweet of each sentiment class as a quick sanity check.
for label, sentiment in (("Positive Tweet example :", 'positive'),
                         ("Negative Tweet example :", 'negative'),
                         ("Neutral tweet example  :", 'neutral')):
    first_tweet = train.loc[train['sentiment'] == sentiment, 'text'].values[0]
    print(label, first_tweet)

Positive Tweet example : 2am feedings for the baby are fun when he is all smiles and coos
Negative Tweet example :  Sooo SAD I will miss you here in San Diego!!!
Neutral tweet example  :  I`d have responded, if I were going


In [29]:
# Class balance is roughly 1.5 : 1 : 1 (neutral : positive : negative).
train['sentiment'].value_counts()

neutral     11117
positive     8582
negative     7781
Name: sentiment, dtype: int64

In [30]:
# Same counts expressed as proportions (normalize=True).
train['sentiment'].value_counts(normalize=True)

neutral     0.404549
positive    0.312300
negative    0.283151
Name: sentiment, dtype: float64

## 시각화

In [37]:
# Bar chart of the sentiment class proportions in the training set.
train['sentiment'].value_counts(normalize=True).iplot(kind='bar',
                                                      yTitle='Percentage', 
                                                      linecolor='black', 
                                                      opacity=0.7,
                                                      color='red',
                                                      theme='pearl',
                                                      bargap=0.6,
                                                      gridcolor='white',
                                                      title='Distribution of Sentiment column in the training set')

In [38]:
# Bar chart of the sentiment class proportions in the test set.
test['sentiment'].value_counts(normalize=True).iplot(kind='bar',
                                                      yTitle='Percentage', 
                                                      linecolor='black', 
                                                      opacity=0.7,
                                                      color='red',
                                                      theme='pearl',
                                                      bargap=0.6,
                                                      gridcolor='white',
                                                      title='Distribution  of Sentiment column in the test set')

## 텍스트 데이터 전처리

In [62]:
def clean_text(text):
    '''Normalise a raw tweet string.

    Applies, in this order: lowercasing, removal of text in square brackets,
    removal of links, removal of angle-bracketed tags, removal of ASCII
    punctuation, removal of newlines, and removal of words containing digits.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace left behind by removed spans is kept
        as-is (it is not collapsed or stripped).
    '''
    # Lowercase everything.
    text = text.lower()

    # Drop square-bracketed spans together with the brackets.
    # `.*?` is a non-greedy match of any characters except a newline.
    text = re.sub(r'\[.*?\]', '', text)

    # Drop links: "http://" or "https://" followed by non-whitespace, or
    # "www." followed by non-whitespace (`\S` matches any non-whitespace char).
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Drop angle-bracketed spans such as HTML tags.
    text = re.sub(r'<.*?>+', '', text)

    # Drop every character listed in string.punctuation.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Drop newlines (joined without inserting a space).
    text = re.sub(r'\n', '', text)

    # Drop whole words that contain a digit.
    # `\w` matches letters, digits and "_"; `\d` matches digits.
    text = re.sub(r'\w*\d\w*', '', text)
    return text

### 정규표현식 예시

In [82]:
# Sample string: punctuation surrounding a square-bracketed span.
temp = "!@#$[여기를 사라지게 해볼게]%^&*()"
temp

'!@#$[여기를 사라지게 해볼게]%^&*()'

In [86]:
# Only the bracketed span (brackets included) is removed.
re.sub(r'\[.*?\]', '', temp)

'!@#$%^&*()'

In [87]:
# Sample string: an https://www. link between two phrases that should survive.
temp = "이건 안사라짐 https://www.naver.com 이건 안사라짐"
temp

'이건 안사라짐 https://www.naver.com 이건 안사라짐'

In [88]:
# The link is removed; the surrounding text (and both spaces) stay.
re.sub(r'https?://\S+|www\.\S+', '', temp)

'이건 안사라짐  이건 안사라짐'

In [99]:
# Same check with a link that has no "www." prefix.
temp = "이건 안사라짐 https://naver.com 이건 안사라짐"
temp

'이건 안사라짐 https://naver.com 이건 안사라짐'

In [100]:
# The "https?://" alternative alone is enough to match this link.
re.sub(r'https?://\S+|www\.\S+', '', temp)

'이건 안사라짐  이건 안사라짐'

In [149]:
# clean_text1 .. clean_text6 apply the cleaning pipeline cumulatively: each
# stage runs the previous stage and then adds exactly one more step, so the
# effect of every individual step can be inspected in isolation.

def clean_text1(text):
    '''Stage 1: lowercase the text and drop square-bracketed spans.'''
    return re.sub(r'\[.*?\]', '', text.lower())


def clean_text2(text):
    '''Stage 2: stage 1, then drop http(s):// and www. links.'''
    return re.sub(r'https?://\S+|www\.\S+', '', clean_text1(text))


def clean_text3(text):
    '''Stage 3: stage 2, then drop angle-bracketed tags.'''
    return re.sub(r'<.*?>+', '', clean_text2(text))


def clean_text4(text):
    '''Stage 4: stage 3, then drop every string.punctuation character.'''
    return re.sub('[%s]' % re.escape(string.punctuation), '', clean_text3(text))


def clean_text5(text):
    '''Stage 5: stage 4, then drop newlines.'''
    return re.sub(r'\n', '', clean_text4(text))


def clean_text6(text):
    '''Stage 6: stage 5, then drop words that contain a digit.'''
    return re.sub(r'\w*\d\w*', '', clean_text5(text))

In [154]:
# Start from the raw tweet text column of the training set.
test1 = train['text']
test1

0                      I`d have responded, if I were going
1            Sooo SAD I will miss you here in San Diego!!!
2                                my boss is bullying me...
3                           what interview! leave me alone
4         Sons of ****, why couldn`t they put them on t...
                               ...                        
27476     wish we could come see u on Denver  husband l...
27477     I`ve wondered about rake to.  The client has ...
27478     Yay good for both of you. Enjoy the break - y...
27479                           But it was worth it  ****.
27480       All this flirting going on - The ATG smiles...
Name: text, Length: 27480, dtype: object

In [155]:
# Apply stage 1 only. astype(str) ensures every cell is a str before
# clean_text1 calls .lower() on it.
test2 = test1.astype(str).apply(clean_text1)
test2

0                      i`d have responded, if i were going
1            sooo sad i will miss you here in san diego!!!
2                                my boss is bullying me...
3                           what interview! leave me alone
4         sons of ****, why couldn`t they put them on t...
                               ...                        
27476     wish we could come see u on denver  husband l...
27477     i`ve wondered about rake to.  the client has ...
27478     yay good for both of you. enjoy the break - y...
27479                           but it was worth it  ****.
27480       all this flirting going on - the atg smiles...
Name: text, Length: 27480, dtype: object

In [156]:
# Compare each tweet before and after stage 1.
# same12 == True  : cleaning left the text unchanged
# same12 == False : cleaning modified the text
test_df = pd.DataFrame({'test1': test1, 'test2': test2, 'same12': test1==test2})
test_df

Unnamed: 0,before,after,same
0,"I`d have responded, if I were going","i`d have responded, if i were going",False
1,Sooo SAD I will miss you here in San Diego!!!,sooo sad i will miss you here in san diego!!!,False
2,my boss is bullying me...,my boss is bullying me...,True
3,what interview! leave me alone,what interview! leave me alone,True
4,"Sons of ****, why couldn`t they put them on t...","sons of ****, why couldn`t they put them on t...",False
...,...,...,...
27476,wish we could come see u on Denver husband l...,wish we could come see u on denver husband l...,False
27477,I`ve wondered about rake to. The client has ...,i`ve wondered about rake to. the client has ...,False
27478,Yay good for both of you. Enjoy the break - y...,yay good for both of you. enjoy the break - y...,False
27479,But it was worth it ****.,but it was worth it ****.,False


In [157]:
# Feed each stage's output into the next cleaning function so that every
# intermediate result is kept for comparison.
test3 = test2.astype(str).apply(clean_text2)
test4 = test3.astype(str).apply(clean_text3)
test5 = test4.astype(str).apply(clean_text4)
test6 = test5.astype(str).apply(clean_text5)
test7 = test6.astype(str).apply(clean_text6)

In [159]:
# Side-by-side view of the original text and the output of each cleaning
# stage (the Korean column labels mean "original" and "function 1" .. "function 6").
test_df = pd.DataFrame({'원본': test1, '함수1': test2, '함수2': test3, '함수3': test4, '함수4': test5, '함수5': test6, '함수6': test7})
test_df

Unnamed: 0,원본,함수1,함수2,함수3,함수4,함수5,함수6
0,"I`d have responded, if I were going","i`d have responded, if i were going","i`d have responded, if i were going","i`d have responded, if i were going",id have responded if i were going,id have responded if i were going,id have responded if i were going
1,Sooo SAD I will miss you here in San Diego!!!,sooo sad i will miss you here in san diego!!!,sooo sad i will miss you here in san diego!!!,sooo sad i will miss you here in san diego!!!,sooo sad i will miss you here in san diego,sooo sad i will miss you here in san diego,sooo sad i will miss you here in san diego
2,my boss is bullying me...,my boss is bullying me...,my boss is bullying me...,my boss is bullying me...,my boss is bullying me,my boss is bullying me,my boss is bullying me
3,what interview! leave me alone,what interview! leave me alone,what interview! leave me alone,what interview! leave me alone,what interview leave me alone,what interview leave me alone,what interview leave me alone
4,"Sons of ****, why couldn`t they put them on t...","sons of ****, why couldn`t they put them on t...","sons of ****, why couldn`t they put them on t...","sons of ****, why couldn`t they put them on t...",sons of why couldnt they put them on the rel...,sons of why couldnt they put them on the rel...,sons of why couldnt they put them on the rel...
...,...,...,...,...,...,...,...
27476,wish we could come see u on Denver husband l...,wish we could come see u on denver husband l...,wish we could come see u on denver husband l...,wish we could come see u on denver husband l...,wish we could come see u on denver husband l...,wish we could come see u on denver husband l...,wish we could come see u on denver husband l...
27477,I`ve wondered about rake to. The client has ...,i`ve wondered about rake to. the client has ...,i`ve wondered about rake to. the client has ...,i`ve wondered about rake to. the client has ...,ive wondered about rake to the client has ma...,ive wondered about rake to the client has ma...,ive wondered about rake to the client has ma...
27478,Yay good for both of you. Enjoy the break - y...,yay good for both of you. enjoy the break - y...,yay good for both of you. enjoy the break - y...,yay good for both of you. enjoy the break - y...,yay good for both of you enjoy the break you...,yay good for both of you enjoy the break you...,yay good for both of you enjoy the break you...
27479,But it was worth it ****.,but it was worth it ****.,but it was worth it ****.,but it was worth it ****.,but it was worth it,but it was worth it,but it was worth it
