In [1]:
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 674 kB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.4.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[K     |████████████████████████████████| 465 kB 69.6 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [2]:
import numpy as np
import pandas as pd

import json
import re

from tqdm.notebook import tqdm
from konlpy.tag import Okt # komoran, hannanum, kkma, mecab
from sklearn.model_selection import train_test_split

In [3]:
import pandas as pd
df_lyrics = pd.read_excel('song_lyrics_labeled (1).xlsx')

X_train, X_test, y_train, y_test = train_test_split(df_lyrics['lyrics'], df_lyrics['love'], test_size=0.2, stratify=df_lyrics['love'], shuffle=True, random_state=0)

X_train[0], y_train[0]

('하루의 일을 끝내고 돌아가는거리엔 사람의 물결하늘엔 별이 하나 둘 반짝이면가로등 하나 둘 꽃 피네허공을 스치는 바람은 차고흐뭇한 마음은 애드베룬가벼운 발길 헤어질 때 인사는내일 또 다시 만납시다하루의 일을 끝내고 돌아가는거리엔 사람의 물결하늘엔 별이 하나 둘 반짝이면가로등 하나 둘 꽃 피네허공을 스치는 바람은 차고흐뭇한 마음은 애드베룬가벼운 발길 헤어질 때 인사는내일 또 다시 만납시다내일 또 다시 만납시다',
 0)

In [4]:
def preprocessing(review):
    okt = Okt()
    
    f = open('stop_list.txt')
    stop_words = f.read().split()
    
    # 1. 한글 및 공백을 제외한 문자 모두 제거.
    review_text = re.sub("[^가-힣\\s]", "", review)
    
    # 2. okt 객체를 활용해서 형태소 토큰화 + 품사 태깅
    word_review = okt.pos(review_text, stem=True)

    # 노이즈 & 불용어 제거
    word_review = [(token, pos) for token, pos in word_review if not token in stop_words and len(token) > 1]
    
    # 명사, 동사, 형용사 추출
    word_review = [token for token, pos in word_review if pos in ['Noun', 'Verb', 'Adjective', 'Adverb']]

    return word_review

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

#okt.nouns를 tokenizer로 이용
tfidf = TfidfVectorizer(tokenizer=preprocessing, max_features=1000, min_df=1, max_df=0.5) 

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
X_train_tfidf[0]

<1x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

train: 0.8, test: 0.2, shuffle = true, random_state = 0

TfidfVectorizer : max_features=2000, min_df=5, max_df=0.5

random forest : n_estimators = 100, max_depth=30, random_state = 0


# 사랑 (love)

Logistic Regression 
* Train set score: 0.884
* Test set score: 0.845

Random Forest
* Train set score: 0.988
* Test set score: 0.837

Naive Bayes
* Train set score: 0.856
* Test set score: 0.816

# 효 (filial)

Logistic Regression 
* Train set score: 0.989
*Test set score: 0.988

Random Forest
* Train set score: 1.000
* Test set score: 0.989

Naive Bayes
* Train set score: 0.993
* Test set score: 0.988

# 남녀평등 (equal)

Logistic Regression 
* Train set score: 0.996
* Test set score: 0.996

Random Forest
* Train set score: 1.000
* Test set score: 0.997

Naive Bayes
* Train set score: 0.998
* Test set score: 0.994

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df_lyrics['lyrics'], df_lyrics['filial'], test_size=0.2, shuffle=True, random_state=0,stratify=df_lyrics['filial'])

#X_train[0], y_train[0]

In [28]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

#okt.nouns를 tokenizer로 이용
tfidf = TfidfVectorizer(tokenizer=preprocessing, max_features=3000, min_df=5, max_df=0.1) 

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [29]:
# 모델 1 : Logistic Regression 모형
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression()
log_clf.fit(X_train_tfidf, y_train)
print('Train set score: {:.3f}'.format(log_clf.score(X_train_tfidf, y_train)))
print('Test set score: {:.3f}'.format(log_clf.score(X_test_tfidf, y_test)))

Train set score: 0.989
Test set score: 0.989


In [30]:
# 모델 2 : 트리 앙상블 모형
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators = 100, max_depth=100, random_state = 0, min_samples_split=2, min_samples_leaf=1)
model_rf.fit(X_train_tfidf, y_train)
print('Train set score: {:.3f}'.format(model_rf.score(X_train_tfidf, y_train)))
print('Test set score: {:.3f}'.format(model_rf.score(X_test_tfidf, y_test)))

Train set score: 1.000
Test set score: 0.990


In [31]:
# 모델 3 : Naive Bayes 분류모형
from sklearn.naive_bayes import MultinomialNB

NB_clf = MultinomialNB(alpha=0.001)
NB_clf.fit(X_train_tfidf, y_train)
print('Train set score: {:.3f}'.format(NB_clf.score(X_train_tfidf, y_train)))
print('Test set score: {:.3f}'.format(NB_clf.score(X_test_tfidf, y_test)))

Train set score: 0.999
Test set score: 0.991


In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

bow = CountVectorizer(tokenizer=preprocessing, min_df=1, max_df=0.5) 

# X_train_bow = bow.fit(X_train)
X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)

In [39]:
# 모델 1 : Logistic Regression 모형
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression()
log_clf.fit(X_train_bow, y_train)
print('Train set score: {:.3f}'.format(log_clf.score(X_train_bow, y_train)))
print('Test set score: {:.3f}'.format(log_clf.score(X_test_bow, y_test)))

Train set score: 1.000
Test set score: 0.992


In [40]:
# 모델 2 : 트리 앙상블 모형
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators = 100, max_depth=100, random_state = 0, min_samples_split=2, min_samples_leaf=1)
model_rf.fit(X_train_bow, y_train)
print('Train set score: {:.3f}'.format(model_rf.score(X_train_bow, y_train)))
print('Test set score: {:.3f}'.format(model_rf.score(X_test_bow, y_test)))

Train set score: 1.000
Test set score: 0.990


In [41]:
# 모델 3 : Naive Bayes 분류모형
from sklearn.naive_bayes import MultinomialNB

NB_clf = MultinomialNB(alpha=0.01)
NB_clf.fit(X_train_bow, y_train)
print('Train set score: {:.3f}'.format(NB_clf.score(X_train_bow, y_train)))
print('Test set score: {:.3f}'.format(NB_clf.score(X_test_bow, y_test)))

Train set score: 0.999
Test set score: 0.988


In [None]:
# df_lyrics.groupby('year').sum()

In [None]:
df_lyrics.groupby('year').sum()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,hope,money,love,filial,friend,equal,swear,nature,regret,memory,sf,food,confi,enjoy
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1964,465,465,487,6,1,17,1,0,0,0,8,7,2,0,0,1,0
1965,1246,1246,1572,6,1,10,2,0,0,0,8,2,3,0,0,3,0
1966,2945,2945,4061,4,0,13,3,1,2,0,6,10,0,0,0,3,2
1967,5355,5355,7300,2,0,21,0,0,0,0,5,18,2,0,0,2,1
1968,6460,6460,8594,5,1,19,1,0,1,0,7,13,1,0,0,0,0
1969,9180,9180,12225,10,1,20,1,1,0,0,6,11,4,0,0,1,0
1970,11205,11205,14967,3,0,23,4,1,0,0,5,12,4,0,0,1,1
1971,13230,13230,18003,4,0,19,3,0,0,0,6,12,6,0,0,0,3
1972,19053,19053,23225,9,0,12,3,3,0,0,15,7,11,0,0,0,3
1973,19984,19984,24094,7,1,15,1,0,0,0,11,10,5,0,0,1,2
