In [1]:
import glob

files = glob.glob('./kbo/*/*/*/*.json')

In [2]:
import json
def read_json_file(path_to_file):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line of the file is parsed independently with
    ``json.loads``.

    Args:
        path_to_file: Path of the file to read.

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding (non-ASCII text would otherwise fail or be
    # mis-decoded on systems whose default is not UTF-8).
    with open(path_to_file, encoding='utf-8') as p:
        # Blank lines (for example a trailing empty line) carry no record;
        # skip them instead of letting json.loads raise on an empty string.
        rows = [json.loads(line) for line in p if line.strip()]
    return rows

In [3]:
import pandas as pd

# Collect the parsed records of every file, grouped by team code.
# The team code is the path component at index 4 after splitting on '/'
# (for './kbo/a/b/c/x.json' that is 'c').
data = {}
for path in files:
    team = path.split('/')[4]
    team_rows = read_json_file(path)
    data.setdefault(team, []).extend(team_rows)

#df = pd.DataFrame(data)

In [4]:
# Report how many records were collected for each team code.
for team, team_rows in data.items():
    print("%s is %d" % (team, len(team_rows)))

SK is 18997
LG is 21742
SS is 19993
OB is 31660
WO is 21672
HT is 20743
KT is 13075
NC is 19235
HH is 24775
LT is 19082


In [5]:
# One DataFrame per team code, then stack them vertically. Passing a dict to
# pd.concat uses the dict keys (team codes) as the outer index level.
dict_of_df = {
    team_code: pd.DataFrame(records)
    for team_code, records in data.items()
}
df = pd.concat(dict_of_df, axis='index')

In [6]:
# Move the outermost index level into an ordinary column (later cells read it
# as 'level_0').
df = df.reset_index(level=0)

In [7]:
# Total number of rows in the combined frame.
len(df)

210974

In [8]:
# Persist the combined frame to 'kbo_raw.pkl' in the working directory.
df.to_pickle('kbo_raw.pkl')

In [3]:
import pandas as pd
# Load the frame stored in 'kbo_raw.pkl'.
# NOTE(review): unpickling executes code from the file; only load pickles
# produced by a trusted source.
df = pd.read_pickle('kbo_raw.pkl')

In [4]:
# Row count of the loaded frame.
len(df)

210974

In [5]:
# Keep only rows whose 'datetime' string sorts after '20150801235959'.
# NOTE(review): this is a lexicographic string comparison; it follows
# chronological order only if every value is a fixed-width digit timestamp
# shaped like the literal - confirm against the raw data.
# .copy() detaches the filtered rows from the original frame so the column
# assignment below writes to an independent object instead of a slice
# (avoids SettingWithCopyWarning / writing into a view).
df = df[df['datetime'] > '20150801235959'].copy()
df['datetime'] = pd.to_datetime(df['datetime'])

In [6]:
# Bucket the rows by the 'M' frequency of the 'datetime' column, then count
# how often each 'level_0' value occurs inside every bucket.
monthly_buckets = pd.Grouper(key='datetime', freq='M')
grouped = df.groupby(monthly_buckets)
group_sample = grouped['level_0'].value_counts()

In [7]:
# Pivot the innermost index level ('level_0' values) into columns, then turn
# the remaining 'datetime' index into a regular column for plotting.
unstacked = group_sample.unstack(level=-1).reset_index(level=0)
unstacked

level_0,datetime,HH,HT,KT,LG,LT,NC,OB,SK,SS,WO
0,2015-08-31,1919,875,677,812,775,1056,1359,964,1040,1472
1,2015-09-30,1439,954,502,807,991,860,1099,1090,1025,1255
2,2015-10-31,309,291,180,158,581,1651,4340,506,1685,1854
3,2015-11-30,347,158,244,280,788,482,1137,261,499,791
4,2015-12-31,426,254,220,197,424,250,416,378,499,454
5,2016-01-31,467,299,208,317,261,283,493,299,427,307
6,2016-02-29,536,427,318,323,315,174,387,387,314,288
7,2016-03-31,752,554,586,571,581,390,623,643,890,676
8,2016-04-30,1712,818,824,1067,1149,709,1577,1076,924,1116
9,2016-05-31,1838,966,574,1052,970,688,1895,750,855,996


In [8]:
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook

# Configure Bokeh to render its output inside the notebook.
output_notebook()

In [9]:
# Line chart of the per-period counts in `unstacked`, one line per team.
#
# Style for every team code: (line colour, legend text). All ten teams are
# listed so that enabling one is a one-word change in TEAMS_TO_PLOT instead of
# un-commenting a copy-pasted call.
TEAM_STYLES = {
    'HH': ('#fba330', '한화'),
    'LT': ('#e86d25', '롯데'),
    'OB': ('#2e297c', '두산'),
    'HT': ('#cc1f29', '기아'),
    'NC': ('#1e91ca', 'NC'),
    'LG': ('#ea2361', 'LG'),
    'KT': ('grey', 'KT'),
    'SS': ('#137dbe', '삼성'),
    'SK': ('red', 'SK'),
    'WO': ('#e91d27', '넥센'),
}
# Teams actually drawn, in draw/legend order.
TEAMS_TO_PLOT = ['HH', 'LT', 'OB', 'HT']

# NOTE(review): `plot_width`/`plot_height`/`legend` are kept exactly as the
# notebook used them; newer Bokeh releases renamed these keywords - confirm
# before upgrading Bokeh.
p1 = figure(plot_width=750, plot_height=500, x_axis_type='datetime')
p1.grid.grid_line_alpha = 0.3
p1.xaxis.axis_label = 'Date'
p1.yaxis.axis_label = 'Count'
for team_code in TEAMS_TO_PLOT:
    line_color, legend_text = TEAM_STYLES[team_code]
    p1.line(unstacked['datetime'], unstacked[team_code],
            color=line_color, legend=legend_text, line_width=2)
show(p1)

In [10]:
# Drop the columns that the text processing does not use.
# Rebinding (instead of inplace=True) avoids mutating a frame that may be a
# slice of another one, and errors='ignore' keeps the cell re-runnable: on a
# second run the columns are already gone and the call becomes a no-op instead
# of raising KeyError.
df = df.drop(['url', 'org_url', 'desc'], axis=1, errors='ignore')

In [11]:
# Replace every run of characters outside the whitelist (digits, ASCII
# letters, Hangul syllables, a few punctuation marks, space) with one space.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats
# the pattern as a literal string by default, which would silently turn this
# cleanup into a no-op.
df['title'] = df['title'].str.replace(
    r"[^0-9a-zA-Z가-힣,.!?@=<>()\[\]{} ]+", " ", regex=True)

In [12]:
# Row count of the current frame.
len(df)

198064

In [13]:
# One text per row: the title, a '. ' separator, then the contents.
title_with_stop = df['title'] + '. '
docs = title_with_stop + df['contents']

In [14]:
# All patterns below are regular expressions. regex=True is passed explicitly
# because pandas 2.0 changed the Series.str.replace default to literal
# matching, which would silently disable every step.

# Replace special characters unrelated to the sentence content with a space.
test = docs.str.replace(r"[^0-9a-zA-Z가-힣=,.:!?@<>()/\-\[\]{} ]+", " ", regex=True)
# Remove e-mail addresses and .kr / .com host names.
# The dot before the TLD is escaped: an unescaped '.' matches any character,
# so ordinary words containing "...kr" or "...com" (e.g. "welcome") were cut.
test = test.str.replace(
    r"([0-9a-zA-Z.]+@[0-9a-zA-Z.]+)|[0-9a-zA-Z.]+\.kr|[0-9a-zA-Z.]+\.com",
    "", regex=True)
# Remove what is left of the URL scheme.
test = test.str.replace(r"http:[/]+", "", regex=True)
# Remove bylines: "[...]" groups and "(... = ...)" groups.
test = test.str.replace(
    r"\[[0-9a-zA-Z가-힣/=()!|,. ]+\]|\([0-9a-zA-Z가-힣()!|,. ]+=[0-9a-zA-Z가-힣()!|,. ]+\)",
    " ", regex=True)
# Remove empty "()" and outlet-specific boilerplate (the two outlet names
# plus the single character that follows them).
test = test.str.replace(r"\(\)[. ]|마이데일리.|조이뉴스24.", " ", regex=True)
#알파벳으로 끝나는 문장 제거
#test=test.str.replace(r"[a-zA-Z ]+[,.]", "")
#test=test.str.replace(r"([0-9a-zA-Z.]+@[0-9a-zA-Z.]+)|[0-9a-zA-Z.]+[.kr]|[0-9a-zA-Z.]+[.com]","")
#sentences=test.str.replace(r"\[[0-9a-zA-Z가-힣()|,. ]+\]|([0-9a-zA-Z.]+@[0-9a-zA-Z.]+)
#|[0-9a-zA-Z.]+[.kr]|[0-9a-zA-Z.]+[.com]|[0-9a-zA-Z.]+@|\([a-z.:/ ]+\)|마이데일리.|조이뉴스24.|[a-zA-Z,. ]+\.|기사제공 [0-9a-zA-Z가-힣 .]+", " ")

## 문장추출

In [15]:
# A sentence is a run of digits, ASCII letters, Hangul syllables and a few
# punctuation marks that ends with a period followed by a space. findall
# returns, for every document, the list of all such runs.
SENTENCE_PATTERN = r"[0-9a-zA-Z가-힣%,()\- ]+\. "
sentences = test.str.findall(SENTENCE_PATTERN)

In [16]:
# Number of entries in `sentences` (one list of matches per document).
len(sentences)

198064

In [17]:
%%time
from konlpy.tag import Mecab
m = Mecab()


def pos(d):
    """Tag the text `d` with Mecab and return each token as 'surface/TAG'."""
    return ['/'.join(p) for p in m.pos(d)]


# Flatten documents -> sentences and tag every sentence, giving one token
# list per sentence.
sentence_pos = []
for doc in sentences:
    for sentence in doc:
        sentence_pos.append(pos(sentence))

CPU times: user 7min 18s, sys: 7.31 s, total: 7min 25s
Wall time: 7min 26s


In [18]:
%%time
from gensim import corpora
# Build a gensim Dictionary over the tagged sentences and save it to
# 'kbo.dict' in the working directory.
dictionary_ko = corpora.Dictionary(sentence_pos)
dictionary_ko.save('kbo.dict')

CPU times: user 2min 21s, sys: 1.7 s, total: 2min 23s
Wall time: 2min 23s


In [19]:
%%time
from gensim.models import word2vec
# Train Word2Vec on the tagged sentences, then store the model on disk.
# Training options are collected in one place so they are easy to tune.
w2v_options = dict(workers=4, size=500, min_count=10)
wv_model_ko = word2vec.Word2Vec(sentence_pos, **w2v_options)
# NOTE(review): per the gensim documentation, init_sims(replace=True) keeps
# only the normalised vectors, so the saved model cannot be trained further -
# confirm that is acceptable.
wv_model_ko.init_sims(replace=True)
wv_model_ko.save('kbo_word2vec.model')

CPU times: user 2h 7min 50s, sys: 3.32 s, total: 2h 7min 53s
Wall time: 32min 17s


In [85]:
# Entries most similar to the query '최고 투수' ("best pitcher"); the query is
# tokenised with the same pos() helper, so its tokens have the 'surface/TAG'
# form used by the model's vocabulary.
wv_model_ko.most_similar(positive=pos('최고 투수'))

[('특급/NNG', 0.5562406778335571),
 ('에이스/NNG', 0.49179181456565857),
 ('최강/NNG', 0.44417691230773926),
 ('강타자/NNG', 0.4441549479961395),
 ('수준급/NNG', 0.4392955005168915),
 ('정상급/NNG', 0.4291062355041504),
 ('톱클래스/NNG', 0.4206096827983856),
 ('최저/NNG', 0.416382372379303),
 ('정통파/NNG', 0.41480788588523865),
 ('선수/NNG', 0.4052785634994507)]

In [65]:
# Same kind of lookup for the query '홍성흔', using the most_similar_cosmul
# variant; the tagged tokens are passed as the first (positive) argument.
wv_model_ko.most_similar_cosmul(pos('홍성흔'))

[('이호준/NNP', 0.8113997578620911),
 ('박세혁/NNP', 0.7877517938613892),
 ('최주환/NNP', 0.7860463261604309),
 ('김재환/NNP', 0.785500705242157),
 ('고영민/NNP', 0.7813785672187805),
 ('오재원/NNP', 0.7793625593185425),
 ('이승엽/NNP', 0.7776396870613098),
 ('오재일/NNP', 0.7773009538650513),
 ('최준석/NNP', 0.7765713930130005),
 ('민병헌/NNP', 0.7725090980529785)]

In [120]:
# Keep only the vocabulary entries whose POS tag starts with 'NN'
# (e.g. NNG, NNP).
# Entries have the form 'surface/TAG', so the tag is whatever follows the
# LAST '/'. rsplit keeps this correct when the surface form itself contains
# '/', where split('/')[1] would read part of the surface instead, and it
# cannot raise IndexError on an entry without '/'.
model_vocab = [word for word in wv_model_ko.wv.vocab.keys()
               if word.rsplit('/', 1)[-1].startswith('NN')]
len(model_vocab)

17651

In [132]:
from sklearn.manifold import TSNE
import numpy as np

# Project the vectors of the first 1000 entries of `model_vocab` to 2-D with
# t-SNE. random_state is fixed so repeated runs use the same seed.
np.set_printoptions(suppress=True)

perplexity = 15
learning_rate = 400

first_words = model_vocab[:1000]
vectors = [wv_model_ko[token] for token in first_words]

tsne = TSNE(
    n_components=2,
    perplexity=perplexity,
    learning_rate=learning_rate,
    random_state=666,
)
vectors2d = tsne.fit_transform(vectors)

In [133]:
from bokeh.models import ColumnDataSource, Range1d, LabelSet, Label


# Columns for the scatter plot: 'height' holds the first t-SNE coordinate,
# 'weight' the second, and 'names' the matching vocabulary entries.
plot_columns = {
    'height': vectors2d[:, 0],
    'weight': vectors2d[:, 1],
    'names': model_vocab[:1000],
}
source = ColumnDataSource(data=plot_columns)

In [135]:
# Scatter plot of the 2-D projection with a text label next to every point.
p2 = figure(plot_width=900, plot_height=900, title='KBO Word2Vec')
p2.scatter(x='weight', y='height', size=2, source=source)

# Text labels, read from the same data source and shifted 5 units off the point.
label_style = dict(x_offset=5, y_offset=5, render_mode='canvas',
                   text_font_size='7pt', text_alpha=0.8)
labels = LabelSet(x='weight', y='height', text='names', level='glyph',
                  source=source, **label_style)
p2.add_layout(labels)

# NOTE(review): no `text` is passed to this Label, so it appears to add only
# an empty bordered box at screen position (70, 70) - confirm it is intended.
box_style = dict(border_line_color='black', border_line_alpha=1.0,
                 background_fill_color='white', background_fill_alpha=1.0)
citation = Label(x=70, y=70, x_units='screen', y_units='screen',
                 render_mode='css', **box_style)
p2.add_layout(citation)

show(p2)

In [90]:
# Both patterns are regular expressions. regex=True is passed explicitly
# because pandas 2.0 changed the Series.str.replace default to literal
# matching, which would silently turn these cleanups into no-ops.

# Title: replace every run of characters outside the whitelist with a space.
df['title'] = df['title'].str.replace(
    r"[^0-9a-zA-Z가-힣,.!?@=<>()\[\]{} ]+", " ", regex=True)

# Contents: one alternative per kind of noise, each replaced with a space.
contents_noise = (
    r"[^0-9a-zA-Z가-힣,.!?@=<>()\[\]{} ]+"   # characters outside the whitelist
    # Host names: the dot before the TLD is escaped, because an unescaped '.'
    # matches any character and cut ordinary words containing "kr" / "com".
    r"|[0-9a-zA-Z.]+\.kr"
    r"|[0-9a-zA-Z.]+\.com"
    r"|[0-9a-zA-Z.]+@[a-zA-Z.]+"             # e-mail addresses
    r"|[0-9a-zA-Z.]+@"                       # truncated e-mail addresses
    r"|\([a-z.:/]+\)"                        # parenthesised lowercase URL-like text
    r"|마이데일리."                           # outlet name plus one character
    r"|조이뉴스24."                           # outlet name plus one character
    r"|[a-zA-Z,. ]+\."                       # Latin-letter runs ending in a period
    r"|기사제공 [0-9a-zA-Z가-힣 .]"           # "기사제공 " plus one character
)
df['contents'] = df['contents'].str.replace(contents_noise, " ", regex=True)

In [96]:
# Keep only the sentence-like runs (ending in a period followed by a space)
# of each row's contents and glue them back together with single spaces.
sentence_runs = df['contents'].str.findall(r"[0-9a-zA-Z가-힣,.!?<>() ]+\. ")
df['contents'] = sentence_runs.apply(' '.join)