<a href="https://colab.research.google.com/github/yonseimath/datascience-biginner-2022-kaggle-competitions/blob/feature%2Fyounghyun/Kaggle_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBRanker
from bisect import bisect

In [None]:
num_train = 10000   # Maximum number of notebook files collected by a directory scan

def read_notebook(path, id_name):
    """Load one notebook JSON file into a DataFrame of its cells.

    Args:
        path: Location of the JSON file, handed to ``pd.read_json``.
        id_name: Value written to the ``id`` column of every row.

    Returns:
        The DataFrame read from ``path`` (``cell_type`` requested as
        categorical, ``source`` as string) with an added ``id`` column and
        the row index named ``cell_id``.
    """
    column_types = {'cell_type': 'category', 'source': 'str'}
    cells = pd.read_json(path, dtype=column_types)
    cells = cells.assign(id=id_name)
    return cells.rename_axis('cell_id')

# Collect at most `num_train` notebook files from the training directory.
# os.scandir yields entries in arbitrary order, so the listing is sorted
# before it is truncated; the same subset is then selected on every run.
directory = '../input/AI4Code/train'
with os.scandir(directory) as entries:
    paths = sorted(entry.path for entry in entries if entry.is_file())[:num_train]

# A notebook's id is its file name up to the first '.'.
id_names = [os.path.basename(path).split('.')[0] for path in paths]

# One DataFrame per notebook, each tagged with its id.
train_notebooks = [
    read_notebook(path, id_name) for path, id_name in zip(paths, id_names)
]

# Print a short summary rather than every path and every DataFrame.
print(f'Loaded {len(train_notebooks)} training notebooks')
print(id_names[:5])
print(paths[:5])

In [None]:
# Stack all notebooks into one frame and index it by (id, cell_id).
df = pd.concat(train_notebooks)
df = df.set_index('id', append=True).swaplevel()
# Sort on the `id` level only; the remaining level is not sorted.
df = df.sort_index(level='id', sort_remaining=False)
df.head()

In [None]:
# Read train_orders.csv indexed by notebook id and split each entry on
# whitespace into a list of tokens.
# `read_csv(squeeze=True)` is no longer accepted by current pandas releases;
# `.squeeze('columns')` is the equivalent that turns the single remaining
# column into a Series.
df_orders = (
    pd.read_csv('../input/AI4Code/train_orders.csv', index_col='id')
    .squeeze('columns')
    .str.split()
)

In [None]:
# Show df_orders (one list of tokens per notebook id) as the cell output.
df_orders

In [None]:
# Cell ids of each loaded notebook, listed in the row order of `df`.
notebook_cells = df.reset_index('cell_id').groupby('id')['cell_id'].apply(list)

# Right join: keep exactly the notebooks present in `df`, pairing each
# notebook's entry from df_orders with its list of cell ids.
df_orders_ = df_orders.to_frame().join(notebook_cells, how='right')

def get_ranks(base, derived):
    """Return the position in ``base`` of every element of ``derived``.

    Args:
        base: Sequence that defines the reference order.
        derived: Iterable of elements to look up in ``base``.

    Returns:
        A list with one integer per element of ``derived``: the index of
        that element's first occurrence in ``base``.

    Raises:
        ValueError: If an element of ``derived`` does not occur in ``base``.
    """
    try:
        # Build the lookup table once instead of scanning `base` for every
        # element; setdefault keeps the first occurrence, like list.index.
        first_position = {}
        for position, element in enumerate(base):
            first_position.setdefault(element, position)
        return [first_position[element] for element in derived]
    except KeyError as missing:
        # Keep the exception type that list.index raises.
        raise ValueError(f'{missing.args[0]!r} is not in list') from None
    except TypeError:
        # Unhashable elements cannot be dictionary keys; use the linear scan.
        return [base.index(element) for element in derived]

# For every notebook, record its cell ids together with the position of each
# cell id inside the notebook's entry from df_orders.
ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = dict(cell_id=cell_id, rank=get_ranks(cell_order, cell_id))

# One row per notebook holding two parallel lists ...
df_ranks = pd.DataFrame.from_dict(ranks, orient='index').rename_axis('id')
# ... exploded to one row per cell and indexed by (id, cell_id).
df_ranks = df_ranks.apply(pd.Series.explode).set_index('cell_id', append=True)

df_ranks

In [None]:
# Load train_ancestors.csv with the `id` column as the index, then display it.
df_ancestors = pd.read_csv('../input/AI4Code/train_ancestors.csv', index_col='id')           # read the ancestors file
df_ancestors

In [None]:
siz = 0.1  # passed to GroupShuffleSplit as test_size (validation share)

# Notebooks with a common origin (ancestor_id) are used as one group, so they
# stay on the same side of the split.
ids = df.index.unique('id')
ancestors = df_ancestors.loc[ids, 'ancestor_id']

# A single split with a fixed random_state.
splitter = GroupShuffleSplit(n_splits=1, test_size=siz, random_state=0)
train_positions, valid_positions = next(splitter.split(ids, groups=ancestors))

# The splitter's output is used to index `ids`, giving notebook ids.
ids_train, ids_valid = ids[train_positions], ids[valid_positions]

df_train = df.loc[ids_train, :]
df_valid = df.loc[ids_valid, :]

In [None]:
# Training features: TF-IDF fitted on the text of every training cell.
tfidf = TfidfVectorizer(min_df=0.01)
train_sources = df_train['source'].astype(str)
X_train = tfidf.fit_transform(train_sources)

# Targets and group sizes are both taken from the training notebooks' ranks.
train_ranks = df_ranks.loc[ids_train]
y_train = train_ranks.to_numpy()                      # rank of each cell within its notebook
groups = train_ranks.groupby('id').size().to_numpy()  # number of cells in each notebook

In [None]:
# Append one extra feature column: for code cells, the 1-based running count
# of the cell within its (notebook, cell_type) group; 0 for all other cells.
# NOTE: X_train is overwritten in place, so re-running this cell on its own
# appends a further column.
train_is_code = df_train['cell_type'] == 'code'
train_type_position = df_train.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1
train_code_order = np.where(train_is_code, train_type_position, 0).reshape(-1, 1)

X_train = sparse.hstack((X_train, train_code_order))
print(X_train.shape)

In [None]:
# Ranking model trained on per-notebook groups of cells.
# The objective is pinned instead of being left to the library default:
# the default is believed to differ between XGBoost releases
# (rank:pairwise in 1.x, rank:ndcg in 2.x), and the NDCG objective restricts
# the label range by default, while the labels here are cell positions.
model = XGBRanker(
    objective='rank:pairwise',
    min_child_weight=10,
    subsample=0.5,
    tree_method='hist',
)
# `groups` holds the number of rows belonging to each notebook.
model.fit(X_train, y_train, group=groups)

In [None]:
# Validation features, produced by the vectorizer fitted on the training cells.
valid_sources = df_valid['source'].astype(str)
X_valid = tfidf.transform(valid_sources)
# The metric uses cell ids
y_valid = df_orders.loc[ids_valid]

# Same extra column as for training: 1-based running count for code cells
# within their (notebook, cell_type) group, 0 for every other cell.
valid_is_code = df_valid['cell_type'] == 'code'
valid_type_position = df_valid.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1
valid_code_order = np.where(valid_is_code, valid_type_position, 0).reshape(-1, 1)

X_valid = sparse.hstack((X_valid, valid_code_order))

In [None]:
# Model output for every validation cell, stored as `rank` and indexed like df_valid.
valid_scores = model.predict(X_valid)
y_pred = pd.DataFrame({'rank': valid_scores}, index=df_valid.index)

# Sort the cells of each notebook by predicted rank, so the cell ids are in
# the order the model predicted.
ranked_cells = y_pred.sort_values(['id', 'rank'])

# Turn the cell_id index level into a column and collect one list of cell ids
# per notebook.
y_pred = ranked_cells.reset_index('cell_id').groupby('id')['cell_id'].apply(list)
y_pred.head(10)

In [None]:
# Take the validation notebook at position 8 among the unique ids as an example.
valid_ids = df_valid.index.get_level_values('id').unique()
nb_id = valid_ids[8]

example_cells = df.loc[nb_id]
display(example_cells)                         # cells in the row order of `df`
display(example_cells.loc[y_pred.loc[nb_id]])  # the same cells in predicted order


In [None]:
def count_inversions_slowly(ranks):
    """Count inversions by comparing every pair of positions.

    An inversion is a pair of positions ``i < j`` with ``ranks[i] > ranks[j]``.

    Args:
        ranks: Indexable sequence of mutually comparable values.

    Returns:
        The number of inversions as an int; 0 for an empty or
        single-element sequence.
    """
    # The counter is `inversions`; the original incremented and returned an
    # undefined name `total`, so every call raised.
    inversions = 0
    size = len(ranks)
    for i in range(size):
        for j in range(i + 1, size):
            if ranks[i] > ranks[j]:
                inversions += 1
    return inversions

def count_inversions(a):
    """Count pairs of positions ``i < j`` in ``a`` with ``a[i] > a[j]``.

    Args:
        a: Iterable of mutually comparable values.

    Returns:
        The number of such pairs as an int.
    """
    seen = []   # values visited so far, kept in ascending order
    total = 0
    for value in a:
        # Insertion point to the right of any equal values: everything from
        # this slot onward is an earlier value strictly greater than `value`.
        slot = bisect(seen, value)
        total += len(seen) - slot
        seen.insert(slot, value)
    return total

def kendall_tau(ground_truth, predictions):
    """Score predicted orderings against true orderings over all notebooks.

    Args:
        ground_truth: Iterable of lists, each the true order of one notebook.
        predictions: Iterable of lists paired positionally with
            ``ground_truth``; each holds the same items in predicted order.

    Returns:
        ``1 - 4 * I / S`` where ``I`` is the total number of inversions over
        all pairs of lists and ``S`` is the sum of ``n * (n - 1)`` over the
        true lists' lengths ``n``.
    """
    inversions_sum = 0
    doubled_pairs_sum = 0
    for true_order, predicted_order in zip(ground_truth, predictions):
        # Express the predicted order as positions in the true order.
        predicted_ranks = list(map(true_order.index, predicted_order))
        inversions_sum += count_inversions(predicted_ranks)
        size = len(true_order)
        doubled_pairs_sum += size * (size - 1)
    return 1 - 4 * inversions_sum / doubled_pairs_sum

In [None]:
# Reference score: each validation notebook's cells in the row order of df_valid.
valid_cells = df_valid.reset_index('cell_id')
y_dummy = valid_cells.groupby('id')['cell_id'].apply(list)
kendall_tau(y_valid, y_dummy)

In [None]:
kendall_tau(y_valid, y_pred)                       # Score the model's predicted cell orders against the true orders

In [None]:
# Load the test notebooks the same way as the training notebooks.
# Every file in the directory is read: the `num_train` cap used for training
# is not applied here, because a truncated test set would leave notebooks out
# of the submission. The listing is sorted because os.scandir yields entries
# in arbitrary order.
directory = '../input/AI4Code/test'
with os.scandir(directory) as entries:
    paths = sorted(entry.path for entry in entries if entry.is_file())

# A notebook's id is its file name up to the first '.'.
id_names = [os.path.basename(path).split('.')[0] for path in paths]

test_notebooks = [
    read_notebook(path, id_name) for path, id_name in zip(paths, id_names)
]

print(test_notebooks[0])

In [None]:
# Stack all test notebooks into one frame and index it by (id, cell_id).
df_test = pd.concat(test_notebooks)
df_test = df_test.set_index('id', append=True).swaplevel()
# Sort on the `id` level only; the remaining level is not sorted.
df_test = df_test.sort_index(level='id', sort_remaining=False)
df_test.head()

In [None]:
# Test features, produced by the vectorizer fitted on the training cells.
test_sources = df_test['source'].astype(str)
X_test = tfidf.transform(test_sources)

# Extra column: 1-based running count for code cells within their
# (notebook, cell_type) group, 0 for every other cell.
test_is_code = df_test['cell_type'] == 'code'
test_type_position = df_test.groupby(['id', 'cell_type']).cumcount().to_numpy() + 1
test_code_order = np.where(test_is_code, test_type_position, 0).reshape(-1, 1)

X_test = sparse.hstack((X_test, test_code_order))

In [None]:
# Model output for every test cell, stored as `rank` and indexed like df_test.
test_scores = model.predict(X_test)
y_infer = pd.DataFrame({'rank': test_scores}, index=df_test.index)

# Order the cells of each notebook by predicted rank and collect the cell ids
# into one list per notebook.
y_infer = (
    y_infer
    .sort_values(['id', 'rank'])
    .reset_index('cell_id')
    .groupby('id')['cell_id']
    .apply(list)
)
y_infer

In [None]:
# Join each notebook's list of cell ids into one space-separated string, then
# name the index `id` and the Series `cell_order`.
joined_orders = y_infer.apply(' '.join)
y_submit = joined_orders.rename_axis('id').rename('cell_order')
y_submit

In [None]:
# Write y_submit to submission.csv (relative path).
y_submit.to_csv('submission.csv')