In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')


In [None]:
# Mount Google Drive at /content/drive so the data files can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Memory reducing

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """Downcast the numeric columns of *df* to the narrowest dtype that fits.

    Each NumPy integer, unsigned-integer and float column is inspected and,
    when its observed min/max fit into a narrower dtype of the same kind,
    converted to that dtype. Columns of any other dtype (object, bool,
    datetime, categorical, pandas extension dtypes, ...) are left untouched.

    Float columns are narrowed on value *range* only, so converting to
    float16/float32 can lose precision.

    Memory usage before and after, and the relative saving, are printed.

    Args:
        df: DataFrame to optimise. It is modified in place.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes per NumPy dtype kind, narrowest first. The 64-bit
    # types are the implicit fallback: a column that fits nothing narrower
    # simply keeps its current dtype.
    candidates = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
        'f': (np.float16, np.float32),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy int/uint/float columns are safe to compare against
        # iinfo/finfo limits; everything else is skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        limits_of = np.finfo if col_type.kind == 'f' else np.iinfo

        for target in candidates[col_type.kind]:
            limits = limits_of(target)
            # Inclusive bounds: a value equal to the type's own min/max fits.
            # NaN min/max (empty or all-NaN column) fails every comparison,
            # so such a column is left as it is.
            if c_min >= limits.min and c_max <= limits.max:
                if np.dtype(target) != col_type:
                    df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Directory on the mounted Drive that holds the project's CSV files.
base_path = '/content/drive/MyDrive/Colab Notebooks/이어드림스쿨/TIL(Today I Learned)/20220607/ML-project/data/'
# Read the training CSV and pass it straight through reduce_mem_usage.
data = reduce_mem_usage(pd.read_csv(base_path + 'train_V2.csv'))

Memory usage of dataframe is 983.90 MB
Memory usage after optimization is: 288.39 MB
Decreased by 70.7%


In [None]:
# Discard every row that contains at least one missing value.
data = data.dropna()

    세연 -  matchType, damageDealt, boosts, heals, assists, revives column에 대해 이상치가 없다고 판단. 그대로 학습에 사용.

    석민 - winPoints, killPoints 결측치가 많아서 학습에서 제외(drop), numGroups와 maxPlace의 correlation과 VIF가 높게 관측. maxPlace를 학습에서 제외 (numGroups를 categorical value로 변환해서 학습에 사용.)

    
    승범 - kills (이상인거), weaponsAcquired, killStreaks 고민 필요

    상현 - matchType (각 타입마다), Distance와 vehicleDestroys의 경우 인사이트를 발굴하지 못 한다면 그냥 사용해도 될 듯
    윤아 - longestkill
    특이값 제거 및 이상치 범위 설정(상관계수의 증가율만 확인한거라 재확인 필요)

    headshotKills
    이상치 범위 설정(상관계수의 증가율만 확인한거라 재확인 필요)

    kill, killStreaks
    둘의 상관관계 0.8이고, killStreaks가 kill에 포함 -> killStreaks drop 

    DBNOs
    damageDealt 아직 못봄

## 1차 feature engineering에서 수정된 사항
    => killPoints, winPoints (결측치가 많이 발견되기 때문에 제거)
    => matchType을 categorical value로 변환하여 학습에 포함.
    => maxPlace, numGroups는 correlation, VIF가 높게 관측, numGroups를 학습에 사용 (categorical value로 변환하여 사용.)

In [None]:
# Drop the identifier columns together with the features excluded from
# training: rankPoints, roadKills, matchDuration, teamKills, killPoints,
# winPoints and maxPlace.
data = data.drop(
    [
        'Id', 'groupId', 'matchId',
        'rankPoints', 'roadKills', 'matchDuration', 'teamKills',
        'killPoints', 'winPoints', 'maxPlace',
    ],
    axis=1,
)

data.head(2)

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,kills,killStreaks,longestKill,matchType,numGroups,revives,rideDistance,swimDistance,vehicleDestroys,walkDistance,weaponsAcquired,winPlacePerc
0,0,0,0.0,0,0,0,60,0,0,0.0,squad-fpp,26,0,0.0,0.0,0,244.75,1,0.444336
1,0,0,91.5,0,0,0,57,0,0,0.0,squad-fpp,25,0,0.004501,11.039062,0,1434.0,5,0.640137


## 윤상현

In [None]:
# Create the matchType2 column as an all-missing object column; it is filled
# below with a coarse team-type label derived from matchType.
data['matchType2'] = pd.Series(None, index=data.index, dtype=object)

# Substring of matchType -> coarse label. crash and flare are both treated
# as "event" mode.
maplist = {
    'squad': 'squad',
    'duo': 'duo',
    'solo': 'solo',
    'crash': 'event',
    'flare': 'event',
}

# Write through a single .loc call rather than chained indexing
# (data.matchType2[mask] = ...): a chained assignment may land on a temporary
# copy and leave `data` unchanged, which is why pandas warns about it.
for key, label in maplist.items():
    # Only rows that have not been labelled by an earlier key are filled.
    mask = (
        data['matchType'].str.contains(key, regex=False, na=False)
        & data['matchType2'].isnull()
    )
    data.loc[mask, 'matchType2'] = label


# Encode the team type as a number: solo -> 1, duo -> 2, squad -> 4,
# anything else (flare & crash events included) -> 0.
def convert(x):
    """Return the numeric code for team-type label *x*, or 0 if unrecognised."""
    for label, code in (("squad", 4), ("duo", 2), ("solo", 1)):
        if x == label:
            return code
    return 0

# Replace each team-type label with its numeric code from convert().
data['matchType2'] = data['matchType2'].apply(convert)

# The original matchType column is no longer needed.
data.drop(columns=['matchType'], inplace=True)

## 문석민

 - correlation을 관측한 결과 
    - winPoints : killPoints (유사도 아주 높음)
    - numGroups : maxPlace (유사도 아주 높음)
    - rankPoints - (winPoints, killPoints) (음의 유사도 아주 높음 : 반대성향)

 ## 결측값의 비율을 알아보자
    - 결측값(winPoints, killPoints 동일)이 2655647개(약 60%)임을 알 수 있다.
    => target column과의 상관관계도 낮고 + 결측값의 비율이 너무 높기 때문에 해당 column들을 학습에서 제외 시키기로 결정!

In [None]:
# Bucket numGroups into three ordinal categories stored in cat_numGroups:
#   0: numGroups < 40
#   1: 40 <= numGroups < 80
#   2: numGroups >= 80
# Each bucket includes its lower bound so the boundary values 40 and 80 land
# in a real bucket; with strict comparisons on both sides they matched no
# condition and silently kept the default 0.
data['cat_numGroups'] = 0
data.loc[data['numGroups'] >= 40, 'cat_numGroups'] = 1
data.loc[data['numGroups'] >= 80, 'cat_numGroups'] = 2

# The raw count is replaced by its bucketed version.
data = data.drop(columns='numGroups')

In [None]:
# Preview the first rows of the frame.
data.head()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,kills,killStreaks,longestKill,revives,rideDistance,swimDistance,vehicleDestroys,walkDistance,weaponsAcquired,winPlacePerc,matchType2,cat_numGroups
0,0,0,0.0,0,0,0,60,0,0,0.0,0,0.0,0.0,0,244.75,1,0.444336,4,0
1,0,0,91.5,0,0,0,57,0,0,0.0,0,0.004501,11.039062,0,1434.0,5,0.640137,4,0
2,1,0,68.0,0,0,0,47,0,0,0.0,0,0.0,0.0,0,161.75,2,0.775391,2,1
3,0,0,32.90625,0,0,0,75,0,0,0.0,0,0.0,0.0,0,202.75,3,0.166748,4,0
4,0,0,100.0,0,0,0,45,1,1,58.53125,0,0.0,0.0,0,49.75,2,0.1875,1,2


## 최윤아

In [None]:
# Remove killStreaks from the feature set, then preview two rows.
data = data.drop('killStreaks', axis=1)
data.head(2)

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,kills,longestKill,revives,rideDistance,swimDistance,vehicleDestroys,walkDistance,weaponsAcquired,winPlacePerc,matchType2,cat_numGroups
0,0,0,0.0,0,0,0,60,0,0.0,0,0.0,0.0,0,244.75,1,0.444336,4,0
1,0,0,91.5,0,0,0,57,0,0.0,0,0.004501,11.039062,0,1434.0,5,0.640137,4,0


In [None]:
# Display the resulting DataFrame.
data

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,kills,longestKill,revives,rideDistance,swimDistance,vehicleDestroys,walkDistance,weaponsAcquired,winPlacePerc,matchType2,cat_numGroups
0,0,0,0.00000,0,0,0,60,0,0.00000,0,0.000000,0.000000,0,244.7500,1,0.444336,4,0
1,0,0,91.50000,0,0,0,57,0,0.00000,0,0.004501,11.039062,0,1434.0000,5,0.640137,4,0
2,1,0,68.00000,0,0,0,47,0,0.00000,0,0.000000,0.000000,0,161.7500,2,0.775391,2,1
3,0,0,32.90625,0,0,0,75,0,0.00000,0,0.000000,0.000000,0,202.7500,3,0.166748,4,0
4,0,0,100.00000,0,0,0,45,1,58.53125,0,0.000000,0.000000,0,49.7500,2,0.187500,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4446961,0,0,0.00000,0,0,0,74,0,0.00000,0,1292.000000,0.000000,0,1019.0000,3,0.178589,4,0
4446962,0,1,44.15625,0,0,0,69,0,0.00000,0,0.000000,0.000000,0,81.6875,6,0.293457,1,2
4446963,0,0,59.06250,0,0,0,66,0,0.00000,0,0.000000,2.183594,0,788.5000,4,0.481445,4,0
4446964,0,4,180.37500,1,1,2,11,2,98.50000,2,0.000000,0.000000,0,2748.0000,8,0.799805,4,0


In [None]:
# Save the engineered frame as CSV under base_path, without writing the index.
data.to_csv(base_path + 'featured_train_2.csv', index=False)

### Feature engineering 


 - 상현 : matchType을 categorical value로 변환.
 - 윤아 : killStreaks column drop.
 - 석민 : winPoints, killPoints, maxPlace column drop, numGroups column categorical value로 변환.
 
  