## 자연어 처리를 통한 성격 예측 : 데이터 확인 및 전처리

In [114]:
import numpy as np
import pandas as pd
import re
import pickle
import nltk
import warnings
from nltk.corpus import stopwords
warnings.filterwarnings('ignore')               # Turn the warnings off.

### 1. 데이터 살펴보기

In [115]:
# Load the dataset from the CSV file and show its (rows, columns) shape.
df = pd.read_csv("mbti_1.csv")
df.shape

(8675, 2)

In [116]:
# Preview the first five rows of the raw data.
df.head(5)

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [117]:
# Column dtypes and non-null counts.
df.info()
# No missing values: both columns report as many non-null entries as there are rows.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [118]:
# Check for duplicated posts.

df['posts'].nunique() # No duplicates: the number of unique posts equals the row count.

8675

### 2. 데이터 전처리

In [119]:
# Build the English stop-word list.
# The corpus reader is looked up through `nltk.corpus` rather than through the
# imported name `stopwords`, because this cell rebinds `stopwords` to a plain
# list. With the old `stopwords = stopwords.words(...)` form, running the cell
# a second time raised AttributeError (a list has no `.words`).
stopwords = nltk.corpus.stopwords.words("english")

In [120]:
# Extend the stop-word list here if extra words should be dropped (currently disabled).
# stopwords += []

In [121]:
# Show the first rows again as a reference point before preprocessing.
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [122]:
# Clean the text while keeping the DataFrame structure.

def _normalize_post(text):
    """Return `text` with URLs removed, lowercased, and reduced to single-spaced words.

    Steps, in order: replace URLs with a space, lowercase, replace non-word
    characters, underscores and digit runs with spaces, collapse whitespace
    runs to one space, and drop a leading/trailing whitespace character.
    """
    text = re.sub(r'(https|http):\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', ' ', text)
    text = text.lower()
    # Each of these patterns is replaced by a single space, in this order.
    for pattern in (r'\W', r'_', r'\d+', r'\s+'):
        text = re.sub(pattern, ' ', text)
    # After whitespace collapsing at most one space remains at either end.
    return re.sub(r'^\s|\s$', '', text)


df['posts'] = df['posts'].apply(_normalize_post)

In [125]:
# Remove stop words and very short tokens from every post.
# The cleaned text is assigned back to the whole column. The previous per-row
# `df['posts'][i] = ...` was chained-indexing assignment, which pandas does not
# guarantee to write through to `df` (and which never does under
# copy-on-write); it also assumed the index labels were exactly 0..n-1.
stopword_set = set(stopwords)  # set membership instead of scanning a list per token


def _drop_stopwords(text):
    """Tokenize `text` and re-join the tokens that are not stop words and are longer than 2 characters."""
    tokens = nltk.word_tokenize(text)
    return ' '.join(tok for tok in tokens if tok not in stopword_set and len(tok) > 2)


df['posts'] = df['posts'].apply(_drop_stopwords)

In [126]:
# Preview the first rows after preprocessing.
df.head()

Unnamed: 0,type,posts
0,INFJ,enfp intj moments sportscenter top ten plays p...
1,ENTP,finding lack posts alarming sex boring positio...
2,INTP,good one course say know blessing curse absolu...
3,INTJ,dear intp enjoyed conversation day esoteric ga...
4,ENTJ,fired another silly misconception approaching ...


In [127]:
# Save the preprocessed frame to a separate CSV file (without the index column).

df.to_csv("mbti_new.csv", index = False)

In [None]:
# Build the corpus ahead of time and store it as a separate object for later word-cloud work and model training.

In [128]:
# Build the corpus: a list with one cleaned string per post.
# The original loop indexed a variable named `posts` that is never defined in
# this notebook (NameError on a fresh kernel); the texts are now read directly
# from df['posts']. The full cleaning pipeline is kept, so the corpus comes out
# cleaned even if the column still holds raw text.
# NOTE(review): if df['posts'] was already cleaned by the cells above, these
# steps are expected to leave it essentially unchanged - confirm.
url_pattern = re.compile(r'(https|http):\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
stopword_set = set(stopwords)  # set membership instead of scanning a list per token

corpus = []
for raw_post in df['posts']:
    post = url_pattern.sub(' ', raw_post)  # remove URLs
    post = post.lower()                    # normalize to lowercase
    post = re.sub(r'\W', ' ', post)        # non-word characters -> space
    post = re.sub(r'_', ' ', post)         # underscores count as word chars, so handle them separately
    post = re.sub(r'\d+', ' ', post)       # digit runs -> space
    post = re.sub(r'\s+', ' ', post)       # collapse whitespace runs
    post = re.sub(r'^\s|\s$', '', post)    # trim the single leading/trailing space that may remain
    words = nltk.word_tokenize(post)
    # Drop stop words and tokens of length 2 or less.
    words = [w for w in words if w not in stopword_set and len(w) > 2]
    corpus.append(' '.join(words))

In [132]:
# Sanity check: print one cleaned post from the corpus.

print(corpus[2])

good one course say know blessing curse absolutely positive best friend could amazing couple count yes could madly love case reconciled feelings thank link called loop stem current topic obsession deadly like stuck thoughts mind wanders circles feels truly terrible noticed peculiar vegetation look grass dozens different plant species imagine hundreds years later soil smiths never one ever often find spotting faces marble tiles wood year old sentence incredibly accurate beautiful description visited website last years whoever reads maybe even remembers highly doubt sit garden writing songs sing together dozens crickets playing acoustic guitar intp ish thread ever seen able look painting entire life knew picked human drawing background animation working right mars felt obligated make mark watneyx postcard read book started make comics turtle gordon unicorn chimes see two first stories intj recently started post comics two friends turtle gordon unicorn chimes posted stuff interested try i

In [130]:
# Save only the corpus object, serialized with pickle.
# Note: only unpickle this file from a trusted location; loading a pickle can run arbitrary code.

with open('corpus.pickle', 'wb') as file:
    pickle.dump(corpus, file)