In [None]:
import pandas as pd
import matplotlib.pylab as plt
import surprise
import numpy as np

In [None]:
# Load the raw trade records, parsing ISSUEDT as dates and pinning the dtypes
# of HSCD, QTY, BYRADDR2 and SPLYADDR2, then preview the first rows.
trade_df = pd.read_csv(
    "./data/trade.csv",
    encoding="utf-8",
    parse_dates=['ISSUEDT'],
    dtype={'HSCD': str, "QTY": float, 'BYRADDR2': str, 'SPLYADDR2': str},
)
trade_df.head()

In [None]:
# Inspect the column labels of the loaded frame.
trade_df.columns

In [None]:
# Preview the four columns that the rest of the notebook works with.
trade_df[['BYRORGNM1', 'SPLYORGNM1', 'AMT', 'CUR']].head()

In [None]:
# Drop every column except the four used below (rebinds trade_df).
trade_df = trade_df[['BYRORGNM1', 'SPLYORGNM1', 'AMT', 'CUR']]

In [None]:
# List the distinct values found in CUR (first occurrence of each).
trade_df.CUR.drop_duplicates()

In [None]:
# Keep only the rows whose CUR is "USD" or "KRW".
trade_df = trade_df.loc[trade_df.CUR.isin(["USD", "KRW"])]

In [None]:
# Preview the first rows recorded in USD.
trade_df.loc[trade_df.CUR == 'USD'].head()

In [None]:
# Preview the first rows recorded in KRW.
trade_df.loc[trade_df.CUR == 'KRW'].head()

In [None]:
def calculate_krw(amt: float, cur: str, usd_rate: float = 1244) -> float:
    """Convert an amount to Korean won (KRW).

    Args:
        amt: Amount expressed in the currency ``cur``.
        cur: Currency code of ``amt``; ``'KRW'`` and ``'USD'`` are supported.
        usd_rate: KRW per one USD. Defaults to 1244, the fixed rate that was
            previously hard-coded in the function body.

    Returns:
        ``amt`` unchanged when ``cur`` is ``'KRW'``; ``amt * usd_rate`` when
        ``cur`` is ``'USD'``.

    Raises:
        ValueError: If ``cur`` is neither ``'KRW'`` nor ``'USD'``. The earlier
            version fell off the end and silently returned ``None``, which
            turned into missing values downstream.
    """
    if cur == 'KRW':
        return amt
    if cur == 'USD':
        return amt * usd_rate
    raise ValueError(f"unsupported currency: {cur!r}")

In [None]:
# Add the KRW-converted amount as a new column.
# `assign` returns a new frame, so nothing is written into the filtered slice
# that trade_df currently refers to (avoids SettingWithCopyWarning). Zipping
# the two columns replaces the slow row-wise `apply(axis=1)` and also works
# when the frame is empty.
trade_df = trade_df.assign(
    AMT_KRW=[calculate_krw(amt, cur) for amt, cur in zip(trade_df['AMT'], trade_df['CUR'])]
)

In [None]:
# Spot-check the conversion on the first row of each currency.
# The earlier lookup of the hard-coded labels 0 and 35800 raises KeyError as
# soon as either row is missing (dropped by the currency filter, or a
# different CSV), so select the sample rows from the data instead.
trade_df.groupby('CUR').head(1)

In [None]:
# Keep only the three columns needed for the pairwise aggregation below.
df = trade_df[['SPLYORGNM1', 'BYRORGNM1', 'AMT_KRW']]

In [None]:
# Preview the narrowed frame.
df.head()

In [None]:
# Total AMT_KRW per (SPLYORGNM1, BYRORGNM1) pair, largest totals first, and
# turn the group keys back into ordinary columns.
df = (
    df.groupby(['SPLYORGNM1', 'BYRORGNM1'])
    .sum()
    .sort_values('AMT_KRW', ascending=False)
    .reset_index()
)

In [None]:
# Preview the aggregated pairs with the largest totals.
df.head()

In [None]:
# Shorten the column labels to spl / byr / krw (same order as the existing
# SPLYORGNM1 / BYRORGNM1 / AMT_KRW columns).
df.columns = ['spl', 'byr', 'krw']

In [None]:
# (rows, columns) of the aggregated pair table.
df.shape

In [None]:
# Pivot the long (spl, byr, krw) table into a matrix: one row per spl and one
# column per byr (nested under the 'krw' column level). Pairs that do not
# occur in df are left as NaN.
df_table = df.set_index(["spl", "byr"]).unstack()
df_table.shape

In [None]:
# Render the whole spl-by-byr matrix as an image.
fig, ax = plt.subplots()
ax.imshow(df_table)
ax.grid(False)
ax.set_xlabel("byr")
ax.set_ylabel("spl")
ax.set_title("Trade Matrix")
plt.show()

In [None]:
# Positional row number of 'POSCO' in df_table's index (ValueError if absent).
list(df_table.index).index('POSCO')

In [None]:
# Locate the column whose label tuple contains `val`: yields
# (column position, position of `val` inside that label tuple).
val = '주식회사 포스코대우'
[(pos, label.index(val)) for pos, label in enumerate(df_table.columns) if val in label]

In [None]:
# Show a 10x10 window of the matrix selected by position.
# NOTE(review): the hard-coded offsets presumably bracket the positions found
# by the two lookup cells above; confirm they still match the current data.
df_table.iloc[13660:13670, 13740:13750]

In [None]:
# Box plot of the non-null entry counts for the 100 rows of df_table that have
# the most entries. `plot.box` has to be called: the bare attribute only
# displayed the bound method's repr and drew nothing. The trailing semicolon
# suppresses the Axes repr in the cell output.
df_table.count(axis=1).sort_values(ascending=False)[:100].plot.box();

In [None]:
# Image of the matrix restricted to the 100 rows with the most non-null entries.
busiest_rows = df_table.count(axis=1).sort_values(ascending=False)[:100].index
fig, ax = plt.subplots()
ax.imshow(df_table.loc[busiest_rows])
ax.grid(False)
ax.set_xlabel("byr")
ax.set_ylabel("spl")
ax.set_title("Trade Matrix")
plt.show()

In [None]:
# Image of the same 10x10 positional window that was displayed as a table.
window = df_table.iloc[13660:13670, 13740:13750]
fig, ax = plt.subplots()
ax.imshow(window)
ax.grid(False)
ax.set_xlabel("byr")
ax.set_ylabel("spl")
ax.set_title("Trade Matrix")
plt.show()

In [None]:
# Number of non-null entries in each column of df_table, largest first.
df_table.count(axis=0).sort_values(ascending=False)

## Convert the DataFrame into a Surprise dataset

In [None]:
# Largest aggregated krw value.
df.krw.max()

In [None]:
# Smallest aggregated krw value.
df.krw.min()

In [None]:
from surprise import Reader
from surprise import Dataset

In [None]:
# Reader describing the rating range handed to Surprise.
# The earlier call passed the undefined name `rating_scale`, which raises
# NameError on a fresh kernel; use the same (0, 5) scale as the later
# `Reader(rating_scale=(0, 5))` cell.
reader = Reader(rating_scale=(0, 5))

In [None]:
# Preview the (spl, byr, krw) rows that will be loaded into Surprise.
df.head()

In [None]:
# Re-check the largest krw value before it is rescaled.
df.krw.max()

In [None]:
# Ratings handed to Surprise are declared to lie on a 0-5 scale.
reader = Reader(rating_scale=(0, 5))

In [None]:
# Display the pair table as it stands before krw is rescaled.
df

In [None]:
# Rescale krw so that its maximum becomes 5.0, the top of the Reader's scale.
df['krw'] = df['krw'] / df['krw'].max() * 5.0

In [None]:
# Build the Surprise dataset from the three columns, passed in the order
# spl, byr, krw, using the Reader's rating scale.
data = Dataset.load_from_df(df[['spl', 'byr', 'krw']], reader=reader)

In [None]:
# Preview the first few raw ratings only: raw_ratings holds one entry per row
# of df, so echoing the whole list floods the cell output.
data.raw_ratings[:5]

In [None]:
from surprise.model_selection import KFold

# Baseline-only model configured for ALS with 500 epochs.
bsl_options = dict(method='als', n_epochs=500, reg_u=12, reg_i=5)
algo = surprise.BaselineOnly(bsl_options=bsl_options)

# Seed NumPy's global RNG before the folds are drawn (presumably so the
# shuffled 3-fold split is repeatable).
np.random.seed(0)
cv = KFold(n_splits=3)
acc = np.zeros(3)
# Fit on each training fold and record the RMSE measured on the held-out fold.
for i, (trainset, testset) in enumerate(cv.split(data)):
    algo.fit(trainset)
    predictions = algo.test(testset)
    acc[i] = surprise.accuracy.rmse(predictions, verbose=True)
acc.mean()

In [None]:
from surprise.model_selection import cross_validate

# KNNBasic with the 'msd' similarity; report the mean test MAE across the
# folds that cross_validate uses by default.
sim_options = dict(name='msd')
algo = surprise.KNNBasic(sim_options=sim_options)
np.mean(cross_validate(algo, data)["test_mae"])

In [None]:
# Load Surprise's built-in 'ml-100k' dataset, presumably to compare its
# raw_ratings layout with `data` in the cells below.
# NOTE(review): load_builtin may ask for confirmation before downloading when
# the dataset is not cached locally, which would stall Restart & Run All.
# Confirm before relying on unattended runs.
data_ml = surprise.Dataset.load_builtin('ml-100k')

In [None]:
# Preview the first few raw ratings of the trade dataset; printing the whole
# list floods the cell output.
data.raw_ratings[:5]

In [None]:
# Preview the first few raw ratings of the built-in dataset for comparison;
# printing the whole list floods the cell output.
data_ml.raw_ratings[:5]