<a href="https://colab.research.google.com/github/vanryuji/data_analysis/blob/master/qanda_review/word_2_vector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install konlpy, the package that the import cell below pulls the Okt tagger from.
!pip install konlpy
# Print every installed package with its version, as a record of the runtime environment.
!pip list

Package                  Version              
------------------------ ---------------------
absl-py                  0.7.1                
alabaster                0.7.12               
albumentations           0.1.12               
altair                   3.0.1                
astor                    0.8.0                
astropy                  3.0.5                
atari-py                 0.1.15               
atomicwrites             1.3.0                
attrs                    19.1.0               
audioread                2.1.8                
autograd                 1.2                  
Babel                    2.7.0                
backcall                 0.1.0                
backports.tempfile       1.0                  
backports.weakref        1.0.post1            
beautifulsoup4           4.6.3                
bleach                   3.1.0                
bokeh                    1.0.4                
boto                     2.49.0               
boto3        

In [0]:
import datetime
import logging
import os
from collections import Counter

import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
import plotly.plotly as py
from gensim.models import Word2Vec
from konlpy.tag import Okt
from sklearn.manifold import TSNE

# Download Qanda review

In [3]:
# Remove any previous checkout so the clone below always starts from a clean directory.
!rm -rf Medium_material
# Fetch the repository that contains the sample review CSV.
!git clone https://github.com/mathpresso/Medium_material.git
# Path of the review CSV inside the cloned repository (relative to the working directory).
review_data = 'Medium_material/qanda_review/sample_201803.csv'
# List the working directory to confirm that the checkout is present.
!ls -l

Cloning into 'Medium_material'...
remote: Enumerating objects: 9, done.[K
remote: Total 9 (delta 0), reused 0 (delta 0), pack-reused 9[K
Unpacking objects: 100% (9/9), done.
total 12
drwxr-xr-x 4 root root 4096 Jun 10 10:49 Medium_material
drwxr-xr-x 3 root root 4096 Jun 10 10:46 NanumBarunGothic
drwxr-xr-x 1 root root 4096 May 31 16:17 sample_data


# Load Qanda review

In [0]:
def change_to_datetime(submit_string):
    """Convert a 'YYYY-MM-DDT...' timestamp string into a date-only datetime.

    Only the calendar date is kept; whatever follows the 'T' in the third
    dash-separated field is discarded, so the result is always at midnight.

    Args:
        submit_string: Timestamp text whose first three dash-separated
            fields are year, month and day (the day optionally followed
            by 'T' and a time part).

    Returns:
        datetime.datetime for the given year, month and day at 00:00:00.

    Raises:
        IndexError: If the string has fewer than three dash-separated fields.
        ValueError: If a field is not an integer or the date is invalid.
    """
    fields = submit_string.split('-')
    year_text, month_text, day_field = fields[0], fields[1], fields[2]

    # Keep only what precedes the first 'T' (the whole field if there is none).
    day_text, _, _ = day_field.partition('T')

    return datetime.datetime(int(year_text), int(month_text), int(day_text))

In [5]:
# Read the raw review export and derive a date-only column from the submit timestamp.
origin = pd.read_csv(review_data)
origin['Date_Time'] = origin['Review Submit Date and Time'].apply(change_to_datetime)

# Keep only the columns used downstream and renumber the rows from 0.
origin = origin.loc[:, ['Review Text', 'Review Title', 'Date_Time']].reset_index(drop=True)
origin.head()

Unnamed: 0,Review Text,Review Title,Date_Time
0,,,2018-03-01
1,진짜좋군뇨,,2018-03-01
2,모르는 문제 있으면 바로바로 앱으로 알수있어서 완전좋습니다조아요 풀이도 이해가기쉽게...,,2018-03-01
3,진짜 좋은것 같아요!,,2018-03-01
4,조..조아여,,2018-03-01


# Word2Vector

In [0]:
# Emit INFO-level log records (timestamp : level : message) in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

def get_processed_data(origin_df, start_date, end_date):
    """Select the reviews inside a date window and split them into morphemes.

    Args:
        origin_df: DataFrame with at least the columns 'Review Text' and
            'Date_Time'.
        start_date: Lower bound of the window, compared against 'Date_Time'
            (inclusive).
        end_date: Upper bound of the window, compared against 'Date_Time'
            (inclusive).

    Returns:
        A tuple ``(reviews, token_counts)``:
        - reviews: DataFrame with columns 'Reviews', 'Date_Time' and
          'preprocess' (the review's Okt morphemes joined by single spaces),
          with rows missing either source column dropped and the index
          renumbered from 0.
        - token_counts: Counter of how often each whitespace-separated token
          occurs across all 'preprocess' strings.
    """
    not_before_start = origin_df.Date_Time >= start_date
    not_after_end = origin_df.Date_Time <= end_date

    reviews = (
        origin_df.loc[not_before_start & not_after_end, ['Review Text', 'Date_Time']]
        .dropna()
        .rename(columns={'Review Text': 'Reviews'})
        .reset_index(drop=True)
    )

    tagger = Okt()

    def join_morphs(text):
        # One space-separated string of morphemes per review.
        return " ".join(tagger.morphs(text))

    reviews['preprocess'] = reviews.Reviews.apply(join_morphs)

    token_counts = Counter()
    for line in reviews.preprocess:
        token_counts.update(line.split())

    return reviews, token_counts

In [0]:
# set credentials

# To obtain api key, sign up to https://plot.ly/
# Refer : https://plot.ly/products/cloud/

# Credentials are taken from the environment variables PLOTLY_USERNAME and
# PLOTLY_API_KEY so that real keys never have to be typed into (or committed
# with) the notebook. The placeholder values are used only when the
# corresponding variable is not set.
username = os.environ.get('PLOTLY_USERNAME', 'your_plot_id')
api_key = os.environ.get('PLOTLY_API_KEY', 'your_plot_api_key')
plotly.tools.set_credentials_file(username=username, api_key=api_key)

In [0]:
# query

# Tokenize every review dated from 2011-08-01 through 2018-10-10 (both bounds inclusive).
query, counter = get_processed_data(
    origin_df=origin,
    start_date='2011-08-01',
    end_date='2018-10-10',
)

In [9]:
# embedding

# Each review is one training "sentence": the list of its space-separated morphemes.
sentences = [line.split() for line in query.preprocess]

embedding_model = Word2Vec(
    sentences,
    size=100,
    window=5,
    min_count=20,
    workers=4,
    iter=50,
    sg=1,
)

2019-06-10 10:49:23,722 : INFO : collecting all words and their counts
2019-06-10 10:49:23,729 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-06-10 10:49:23,739 : INFO : collected 5297 word types from a corpus of 38175 raw words and 3007 sentences
2019-06-10 10:49:23,740 : INFO : Loading a fresh vocabulary
2019-06-10 10:49:23,746 : INFO : effective_min_count=20 retains 281 unique words (5% of original 5297, drops 5016)
2019-06-10 10:49:23,747 : INFO : effective_min_count=20 leaves 26129 word corpus (68% of original 38175, drops 12046)
2019-06-10 10:49:23,750 : INFO : deleting the raw counts dictionary of 5297 items
2019-06-10 10:49:23,751 : INFO : sample=0.001 downsamples 89 most-common words
2019-06-10 10:49:23,752 : INFO : downsampling leaves estimated 15472 word corpus (59.2% of prior 26129)
2019-06-10 10:49:23,753 : INFO : estimated required memory for 281 words and 100 dimensions: 365300 bytes
2019-06-10 10:49:23,755 : INFO : resetting layer weight

In [10]:
# visualization

# Seed for t-SNE: without it the 2-D layout differs on every run even when
# the input vectors are identical.
TSNE_RANDOM_STATE = 42

# Walk the vocabulary once so that row i of X always corresponds to voca[i].
voca = list(embedding_model.wv.vocab.keys())
X = np.array([embedding_model.wv.get_vector(word) for word in voca])

# Project the word vectors down to two dimensions for plotting.
X_embeded = TSNE(n_components=2, random_state=TSNE_RANDOM_STATE).fit_transform(X)

# One labelled marker per vocabulary word, placed at its 2-D coordinates.
trace2 = go.Scatter(
    x=X_embeded[:, 0],
    y=X_embeded[:, 1],
    mode='markers+text',
    name='Markers and Text',
    text=voca,
    textposition='bottom center'
)

data = [trace2]
layout = go.Layout(
    showlegend=False
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='text-chart-basic')

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~vanryuji/0 or inside your plot.ly account where it is named 'text-chart-basic'
