In [1]:
import pandas as pd
import matplotlib.pylab as plt
import surprise
import numpy as np
from surprise import Reader
from surprise import Dataset

In [2]:
# Read csv
# Load the raw trade records. Code-like columns are forced to str, QTY to
# float, and ISSUEDT is parsed as a datetime.
trade_dtypes = {'HSCD': str, "QTY": float, 'BYRADDR2': str, 'SPLYADDR2': str}
trade_df = pd.read_csv(
    "./data/trade.csv",
    encoding="utf-8",
    parse_dates=['ISSUEDT'],
    dtype=trade_dtypes,
)

In [3]:
trade_df = trade_df[['BYRORGNM1', 'SPLYORGNM1']]

In [4]:
df = trade_df.groupby(['BYRORGNM1', 'SPLYORGNM1']).agg('size').reset_index()

In [5]:
df.columns = ['byr', 'spl', 'trd_cnt']

In [6]:
df.groupby('byr').agg('size').sort_values().reset_index().groupby(0).agg('size').sum()

16618

In [10]:
df.groupby('byr').agg('size').sort_values().reset_index().groupby(0).agg('size').loc[50:].sum()

86

In [11]:
# Buyers that appear in at least 5 (buyer, supplier) pairs.
byr_counts = df['byr'].value_counts()
b = byr_counts[byr_counts >= 5].index.tolist()

In [12]:
# Suppliers that appear in at least 5 (buyer, supplier) pairs.
spl_counts = df['spl'].value_counts()
s = spl_counts[spl_counts >= 5].index.tolist()

In [14]:
df = df.loc[(df['byr'].isin(b)) & (df['spl'].isin(s))].reset_index(drop=True)

In [16]:
df.head()

Unnamed: 0,byr,spl,trd_cnt
0,(유)브이피에이치아이,(주)오토탑,43
1,(유)브이피에이치아이,(주)평화발레오,3
2,(유)브이피에이치아이,(주)홍성브레이크,14
3,(유)브이피에이치아이,케이비와이퍼시스템 주식회사,13
4,(유)신한,강남제비스코(주),3


In [17]:
df.groupby('trd_cnt').agg('size').head()

trd_cnt
1    3159
2    2692
3    1051
4     910
5     426
dtype: int64

In [18]:
df.groupby('trd_cnt').agg('size').sum()

12981

In [24]:
df.groupby('trd_cnt').agg('size').loc[:10]

trd_cnt
1     3159
2     2692
3     1051
4      910
5      426
6      469
7      291
8      350
9      191
10     222
dtype: int64

In [20]:
df.groupby('trd_cnt').agg('size')[10:].sum()

3220

### Test: ratings from trade counts (trd_cnt), before dropping sparse ratings

In [None]:
df.groupby('trd_cnt').agg('size').sum()

In [None]:
df.groupby('trd_cnt').agg('size').loc[701:].sum()

In [None]:
import sys

# Rating level -> inclusive [low, high] range of trade counts.
# The bins are contiguous, so each lower bound is the previous upper bound + 1;
# the last bin is open-ended (capped at sys.maxsize).
upper_bounds = [1, 2, 4, 7, 12, 23, 50, 160, 700, sys.maxsize]
rating = {}
low = 1
for level, high in enumerate(upper_bounds, start=1):
    rating[level] = [low, high]
    low = high + 1

In [None]:
21477 + 13957 + 10439 + 6577 + 5284 + 4303 + 3603 + 2929 + 1634 + 589

In [None]:
# Replace each raw trade count with its rating level.
# The masks are evaluated against a snapshot of the raw counts rather than the
# column being overwritten, so a value that has already been mapped can never be
# re-matched by a later bin, whatever the bin order or boundaries are.
raw_cnt = df['trd_cnt'].copy()
for score, (low, high) in rating.items():
    # `between` is inclusive on both ends, matching the [low, high] bins.
    df.loc[raw_cnt.between(low, high), 'trd_cnt'] = score

In [None]:
df.head()

In [None]:
df.groupby('trd_cnt').agg('size').plot.bar()

In [None]:
df.head()

In [None]:
df.groupby('byr').agg('size').sort_values(ascending=False)[:10].plot()

### Build a Surprise dataset from the DataFrame

In [None]:
reader = Reader(rating_scale=(1,10))

In [None]:
data = Dataset.load_from_df(df, reader)

In [None]:
from surprise.model_selection import cross_validate

# Baseline estimator fitted with ALS: 5 epochs, user regularisation 12,
# item regularisation 5.
bsl_options = dict(method='als', n_epochs=5, reg_u=12, reg_i=5)
algo = surprise.BaselineOnly(bsl_options)

# Cross-validate with Surprise's defaults; the result dict is the cell output.
cross_validate(algo, data)

## By AMT

In [None]:
trade_df[['BYRORGNM1', 'SPLYORGNM1', 'AMT', 'CUR']].head()

In [None]:
trade_df = trade_df[['BYRORGNM1', 'SPLYORGNM1', 'AMT', 'CUR']]

In [None]:
trade_df.CUR.drop_duplicates()

In [None]:
trade_df = trade_df.loc[(trade_df.CUR == "USD") | (trade_df.CUR == "KRW")]

In [None]:
trade_df.loc[trade_df.CUR == 'USD'].head()

In [None]:
trade_df.loc[trade_df.CUR == 'KRW'].head()

In [None]:
def calculate_krw(amt: float, cur: str, usd_rate: float = 1244) -> float:
    """Convert a trade amount to Korean won.

    Args:
        amt: Amount expressed in the currency named by ``cur``.
        cur: Currency code; only ``'KRW'`` and ``'USD'`` are supported.
        usd_rate: Won per US dollar applied to USD amounts. Defaults to 1244,
            the fixed rate previously hard-coded here.

    Returns:
        ``amt`` unchanged for KRW, or ``amt * usd_rate`` for USD.

    Raises:
        ValueError: If ``cur`` is any other currency. The function used to fall
            through and return ``None`` in that case, silently contradicting
            the ``float`` return annotation.
    """
    if cur == 'KRW':
        return amt
    if cur == 'USD':
        return amt * usd_rate
    raise ValueError(f"unsupported currency: {cur!r}")

In [None]:
trade_df['AMT_KRW'] = trade_df.apply(lambda x: calculate_krw(x['AMT'], x['CUR']), axis=1)

In [None]:
trade_df.loc[[0, 35800]]

In [None]:
df = trade_df[['SPLYORGNM1', 'BYRORGNM1', 'AMT_KRW']]

In [None]:
df.head()

In [None]:
df = df.groupby(['SPLYORGNM1', 'BYRORGNM1']).agg('sum').sort_values('AMT_KRW', ascending=False).reset_index()

In [None]:
df.head()

In [None]:
df.columns = ['spl', 'byr', 'krw']

In [None]:
df.shape

In [None]:
# Pivot the long (spl, byr, krw) table into a wide matrix: one row per supplier,
# with the buyer level unstacked into the columns (a ('krw', byr) MultiIndex).
# Supplier/buyer combinations with no row in `df` become NaN.
df_table = df.set_index(["spl", "byr"]).unstack()
# Show the matrix dimensions as the cell output.
df_table.shape

In [None]:
# Render the full supplier x buyer matrix as an image.
fig, ax = plt.subplots()
ax.imshow(df_table)
ax.grid(False)
ax.set_xlabel("byr")
ax.set_ylabel("spl")
ax.set_title("Trade Matrix")
plt.show()

In [None]:
list(df_table.index).index('POSCO')

In [None]:
# Locate the column(s) of df_table whose key tuple contains this buyer name;
# each hit is (column position, position of the name inside the key tuple).
val = '주식회사 포스코대우'
matches = []
for col_pos, col_key in enumerate(df_table.columns):
    if val in col_key:
        matches.append((col_pos, col_key.index(val)))
matches

In [None]:
df_table.iloc[13660:13670, 13740:13750]

In [None]:
df_table.count(axis=1).sort_values(ascending=False)[:100].plot.box

In [None]:
# Same image, restricted to the 100 suppliers with the most non-null buyers.
top_spl = df_table.count(axis=1).sort_values(ascending=False)[:100].index
fig, ax = plt.subplots()
ax.imshow(df_table.loc[top_spl])
ax.grid(False)
ax.set_xlabel("byr")
ax.set_ylabel("spl")
ax.set_title("Trade Matrix")
plt.show()

In [None]:
# Zoom in on a 10 x 10 positional window of the matrix.
matrix_window = df_table.iloc[13660:13670, 13740:13750]
fig, ax = plt.subplots()
ax.imshow(matrix_window)
ax.grid(False)
ax.set_xlabel("byr")
ax.set_ylabel("spl")
ax.set_title("Trade Matrix")
plt.show()

In [None]:
df_table.count(axis=0).sort_values(ascending=False)

## Convert the DataFrame to a Surprise dataset

In [None]:
df.krw.max()

In [None]:
df.krw.min()

In [None]:
from surprise import Reader
from surprise import Dataset

In [None]:
reader = Reader(rating_scale)

In [None]:
df.head()

In [None]:
df.krw.max()

In [None]:
reader=Reader(rating_scale=(0, 5))

In [None]:
df

In [None]:
df.krw = df.krw / df.krw.max() * 5.0

In [None]:
data = Dataset.load_from_df(df[['spl', 'byr', 'krw']], reader=reader)

In [None]:
data.raw_ratings

In [None]:
from surprise.model_selection import KFold

# ALS baseline: 500 epochs, user regularisation 12, item regularisation 5.
bsl_options = dict(method='als', n_epochs=500, reg_u=12, reg_i=5)
algo = surprise.BaselineOnly(bsl_options)

# Seed numpy's global RNG before the folds are drawn.
np.random.seed(0)

n_splits = 3
cv = KFold(n_splits)
acc = np.zeros(n_splits)  # one RMSE slot per fold
for fold, (trainset, testset) in enumerate(cv.split(data)):
    algo.fit(trainset)
    predictions = algo.test(testset)
    acc[fold] = surprise.accuracy.rmse(predictions, verbose=True)

# Mean RMSE over the folds is the cell output.
acc.mean()

In [None]:
from surprise.model_selection import cross_validate

# Basic k-NN with mean-squared-difference similarity.
sim_options = dict(name='msd')
algo = surprise.KNNBasic(sim_options=sim_options)

# Mean MAE across the default cross-validation folds is the cell output.
cv_scores = cross_validate(algo, data)
cv_scores["test_mae"].mean()

In [None]:
data_ml = surprise.Dataset.load_builtin('ml-100k')

In [None]:
data.raw_ratings

In [None]:
data_ml.raw_ratings

# Trade count

In [None]:
df = trade_df[['BYRORGNM1', 'SPLYORGNM1']]

In [None]:
df = df.groupby(['BYRORGNM1', 'SPLYORGNM1']).agg('size').reset_index()

In [None]:
df.head()

In [None]:
df.columns = ['byr', 'spl', 'trd_cnt']

In [None]:
df.head()

In [None]:
df.sort_values('trd_cnt').trd_cnt.reset_index(drop=True).plot()

In [None]:
len(df.loc[df.trd_cnt < 3])

In [None]:
len(df.loc[df.trd_cnt <= 14])

In [None]:
df.sort_values('trd_cnt').trd_cnt.reset_index(drop=True)[35434:58914].plot.box()

In [None]:
df.sort_values('trd_cnt').trd_cnt.reset_index(drop=True)[35434:58914].plot()

In [None]:
df = df.sort_values('trd_cnt')[35434:58914].reset_index(drop=True)

In [None]:
df.tail()

In [None]:
df.groupby('spl').agg('size').sort_values().plot()

In [None]:
df.groupby('spl').agg('size').sort_values().tail()

In [None]:
df.groupby('spl').agg('size').plot.box()