<a href="https://colab.research.google.com/github/yonseimath/datascience-biginner-2022-kaggle-competitions/blob/feature%2Fyenakim/yenakim/AI4Code_EDA_as_Binary_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install mglearn

In [None]:
# General
import sys, warnings, time, os, copy, gc, re, random, json
import pickle as pkl
warnings.filterwarnings('ignore')
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
# pd.set_option("display.max_colwidth", 10000)
import seaborn as sns
sns.set()
from pandas.io.json import json_normalize
from pprint import pprint
from pathlib import Path
from tqdm import tqdm
tqdm.pandas()
from datetime import datetime, timedelta
from scipy import sparse
import mglearn

# Pre-Processing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
# from jrstc_util import cleanse_text_new, text_cleaning, clean # 아마 만들어야 하는 함수로 추정
from wordcloud import WordCloud, STOPWORDS
import umap

# Model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [None]:
# Configuration
DEBUG = False  # switch to True for a quick run on a handful of notebooks
PATH_INPUT = Path('../input/AI4Code')  # root directory of the input data
SAMPLE_ID = '051d049a469e47'  # id of the notebook inspected as an example

# Number of notebooks to load: 10 in debug mode (1000 is a larger debug
# alternative), 2000 otherwise.
NUM_SAMPLE = 10 if DEBUG else 2000

In [None]:
def dump_load(obj, fileName, mode):
    """Pickle ``obj`` to a file, or unpickle an object from a file.

    Args:
        obj: Object to serialise. Ignored when ``mode`` is ``'rb'``.
        fileName: Path of the pickle file.
        mode: ``'wb'`` to write ``obj`` to ``fileName``, ``'rb'`` to read
            the pickled object back.

    Returns:
        The unpickled object when ``mode`` is ``'rb'``; ``None`` otherwise.
        Any other mode only prints a hint and does nothing else.
    """
    if mode not in ('wb', 'rb'):
        print('Please give "wb" or "rb" as mode.')
        return None

    with open(fileName, mode=mode) as handle:
        if mode == 'rb':
            # NOTE: unpickling executes arbitrary code; only load trusted files.
            return pkl.load(handle)
        pkl.dump(obj, handle)

    return None

In [None]:
def read_notes(path):
    """Load one notebook JSON file into a DataFrame.

    Args:
        path: ``pathlib.Path`` of the JSON file; its stem is stored in the
            ``id`` column of every row.

    Returns:
        DataFrame whose index is named ``cell_id``, with ``cell_type`` read
        as a categorical column, ``source`` read as strings, and an added
        ``id`` column holding ``path.stem``.
    """
    columnTypes = {'cell_type': 'category', 'source': 'str'}
    notebook = pd.read_json(path, dtype=columnTypes)
    notebook = notebook.assign(id=path.stem)

    return notebook.rename_axis('cell_id')

In [None]:
# Read the first NUM_SAMPLE notebook files found in the train directory.
listTrainPaths = list((PATH_INPUT / 'train').glob('*.json'))[:NUM_SAMPLE]
listTrainNotes = [read_notes(path) for path in tqdm(listTrainPaths)]

# Stack all notebooks and index the rows by (id, cell_id). Only the 'id' level
# is sorted, so the cells of a notebook keep the order they were read in.
dfTrain = (
    pd.concat(listTrainNotes)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)
dfTrain

In [None]:
# Load the cell orders as a Series indexed by notebook id.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# pandas 2.0; squeezing the single remaining column afterwards gives the same
# Series on old and new pandas versions.
dfOrders = pd.read_csv(PATH_INPUT / 'train_orders.csv', index_col='id').squeeze('columns')
# Each entry is a whitespace-separated string of cell ids; turn it into a list.
dfOrders = dfOrders.str.split()
dfOrders

In [None]:
# Pull out the sample notebook and report how many cells of each type it has.
dfSample = dfTrain.loc[SAMPLE_ID, :]
numCode = int(dfSample['cell_type'].eq('code').sum())  # number of code cells
numMark = int(dfSample['cell_type'].eq('markdown').sum())  # number of markdown cells
print(f'Notebook {SAMPLE_ID} has {numCode} code cells and {numMark} markdown cells. \n')
dfSample

In [None]:
# Display the sample notebook's cells in the order listed in train_orders.csv.
listOrders = dfOrders.loc[SAMPLE_ID]  # list of cell ids for the sample notebook
dfSample.loc[listOrders]

In [None]:
# The sample notebook is no longer needed: drop the reference and run a
# garbage-collection pass.
del dfSample
gc.collect()

# Column(markdown 앞뒤 셀) 추가

In [None]:
# NOTE(review): `cleanse_text_new` is not defined or imported in the visible
# part of this notebook (its import from `jrstc_util` is commented out), so
# this cell raises NameError unless the helper is provided elsewhere — confirm.
dfTrain['source'] = dfTrain['source'].progress_apply(cleanse_text_new) # cleanse the text; helper comes from a personal file

In [None]:
# Build the positive samples: one row per markdown cell, holding the cell that
# precedes it ('code1'), the markdown itself, and the cell that follows it
# ('code2') in the notebook's listed order. The neighbours are taken purely by
# position, whatever their cell type is.
listID = set(dfTrain.index.get_level_values('id'))  # set of notebook ids

listRows = []  # (code1, markdown, code2) tuples; the frame is built once at the end
for nbid in tqdm(sorted(listID)):  # sorted so that the row order is reproducible
    dfTemp = dfTrain.loc[nbid, :]  # cells of this notebook, indexed by cell_id
    srcById = dfTemp['source']
    listMD = dfTemp.index[dfTemp['cell_type'] == 'markdown'].tolist()  # ids of the markdown cells
    listOrders = dfOrders.loc[nbid]  # ordered cell ids of this notebook
    posById = {cellId: i for i, cellId in enumerate(listOrders)}  # position of every cell
    lastPos = len(listOrders) - 1

    for mdid in listMD:
        pos = posById[mdid]  # position of the markdown cell in the notebook
        # The first cell has no predecessor and the last one no successor; the
        # placeholders 'start' / 'end' are used instead. A notebook with a
        # single cell gets both placeholders.
        code1 = 'start' if pos == 0 else srcById.loc[listOrders[pos - 1]]
        code2 = 'end' if pos == lastPos else srcById.loc[listOrders[pos + 1]]
        listRows.append((code1, srcById.loc[mdid], code2))

# Every row is labelled 'source', as before, so that downstream cells relying
# on the index see the same labels.
dfTrue = pd.DataFrame(listRows,
                      columns=['code1', 'markdown', 'code2'],
                      index=['source'] * len(listRows))
dfTrue['label'] = 'True'
dfTrue

In [None]:
# Build the negative samples: the same neighbouring cells as dfTrue, but with
# the markdown column shuffled across rows.
dfFalse = dfTrue.copy()
# Assign the shuffled values positionally (.to_numpy() drops the index). A
# Series is aligned on its index labels during column assignment, which would
# put every value back in its original row whenever those labels are unique.
dfFalse['markdown'] = dfTrue['markdown'].sample(frac=1).to_numpy() # shuffle the data
dfFalse['label'] = 'False'
dfFalse

In [None]:
# Stack positive and negative samples, then add one column holding the three
# texts of each row joined by single spaces.
dfAll = pd.concat([dfTrue, dfFalse])
dfAll = dfAll.assign(
    textAll=dfAll['code1'] + ' ' + dfAll['markdown'] + ' ' + dfAll['code2']
)

# 코사인 유사도

In [None]:
# `vectorizer` is not created anywhere in the visible notebook, so the
# transforms below would raise NameError. Fit a TF-IDF vectorizer on the
# combined text so that all three columns share one vocabulary.
# NOTE(review): fitting a default TfidfVectorizer on 'textAll' is an
# assumption — confirm the intended vectorizer and its settings.
vectorizer = TfidfVectorizer()
vectorizer.fit(dfAll['textAll'])

vectCode1 = vectorizer.transform(dfAll['code1']) # convert to vectors
vectMD = vectorizer.transform(dfAll['markdown'])
vectCode2 = vectorizer.transform(dfAll['code2'])

In [None]:
# Row-wise cosine similarity between each markdown cell and its two
# neighbours. The pairs are collected in a list and turned into an array once,
# instead of growing an array with np.concatenate on every iteration.
listCosSim = [
    (cosine_similarity(vectCode1[i], vectMD[i])[0, 0],   # previous cell vs markdown
     cosine_similarity(vectMD[i], vectCode2[i])[0, 0])   # markdown vs next cell
    for i in range(dfAll.shape[0])
]
cosSimAll = np.array(listCosSim, dtype=float).reshape(-1, 2)

# dfAll stacks dfTrue and dfFalse, so its index labels are not unique and
# cannot be aligned against a fresh RangeIndex with pd.concat(axis=1). Reuse
# dfAll's index for dfCosSim and attach the columns by position instead.
dfCosSim = pd.DataFrame(data=cosSimAll, columns=['cos_sim1', 'cos_sim2'], index=dfAll.index)
dfAll['cos_sim1'] = cosSimAll[:, 0]
dfAll['cos_sim2'] = cosSimAll[:, 1]

In [None]:
# Scatter the two similarities against each other, coloured by label.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(data=dfAll, x='cos_sim1', y='cos_sim2', hue='label', ax=ax)

In [None]:
# Save the full sample table (including its index) to 'dfAll.csv' in the
# working directory, then display it.
dfAll.to_csv('dfAll.csv')
dfAll