In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt  

from timeit import default_timer as timer
from sklearn import preprocessing

# !pip install ultimate
# from ultimate.mlp import MLP 

import gc, sys
# Explicitly switch on automatic garbage collection for this kernel session.
gc.enable()


In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified in place and also returned. Memory usage before and
    after (in MB) is printed.

    Only plain NumPy signed-integer, unsigned-integer and float columns are
    touched. Every other column (bool, object/string, datetime, categorical,
    pandas extension dtypes) is left exactly as it is, and a column is never
    converted to a wider dtype than it already has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    use_float16 : bool, default True
        When True, float columns whose min/max fit the float16 range are stored
        as float16. The choice looks at the value *range* only, not at the
        precision: float16 keeps roughly three significant decimal digits, so
        large or fine-grained values get rounded. Pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, narrowest first, per NumPy dtype kind.
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)
    candidates_by_kind = {
        'i': (np.iinfo, (np.int8, np.int16, np.int32)),
        'u': (np.iinfo, (np.uint8, np.uint16, np.uint32)),
        'f': (np.finfo, float_candidates),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Skip anything that is not a plain NumPy int/uint/float column.
        # bool has kind 'b', so it no longer falls into the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        limits_of, candidates = candidates_by_kind[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Candidates are ordered by size: once they stop being narrower
            # than the current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype limit still fits.
            # NaN min/max (empty or all-NaN column) compare False, so such
            # columns keep their dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a frame that reports zero bytes.
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Directory prefix for the raw competition CSVs; file names are appended to it
# directly, so it must end with a path separator.
# NOTE(review): this is a hard-coded absolute path (looks like a Colab Google Drive
# mount) — it has to be adjusted when the notebook runs anywhere else.
base_path = '/content/drive/MyDrive/Colab Notebooks/이어드림스쿨/TIL(Today I Learned)/20220607/ML-project/data/Raw/'

In [None]:
def state(message, start=True, time=0):
    """Print a progress line for the processing step named ``message``.

    With a truthy ``start`` the "working on" banner is printed. Otherwise the
    completion line is printed, showing ``time`` (elapsed seconds) rounded to
    three decimals and followed by an empty line.
    """
    if not start:
        elapsed = round(time, 3)
        print(f'Working on {message} took ({elapsed}) Sec \n')
        return
    print(f'Working on {message} ... ')

In [None]:
# Columns that never enter the per-group feature means: identifiers, raw inputs
# that were replaced by engineered features, and intermediate helper columns.
# 'matchType' is deliberately absent: by the time features are selected it holds
# the numeric team-size code, which is a feature.
_NON_FEATURE_COLUMNS = (
    'Id', 'matchId', 'groupId', 'matchDuration', 'killPlace', 'maxPlace',
    'killPoints', 'roadKills', 'teamKills', 'winPoints', 'damageDealt', 'kills',
    'rideDistance', 'swimDistance', 'vehicleDestroys', 'walkDistance',
    'playersJoined', 'weaponsAcquired', 'heals', 'boosts', 'sum_rideAndswim',
)


def _encode_match_type(match_type):
    """Map raw matchType strings to a team-size code: squad 4, duo 2, solo 1, anything else 0.

    The first keyword found wins, in the order squad, duo, solo. Every other
    value (including the 'crash' / 'flare' event modes and missing values)
    becomes 0.
    """
    keyword_hits = [
        match_type.str.contains(keyword, regex=False, na=False).to_numpy()
        for keyword in ('squad', 'duo', 'solo')
    ]
    return np.select(keyword_hits, [4, 2, 1], default=0)


def _bucketize(rules):
    """Return an integer array holding, per row, the label of the rule that row satisfies.

    ``rules`` is a sequence of ``(boolean mask, integer label)`` pairs whose
    masks are expected to be mutually exclusive. A row that satisfies no rule
    (for example a NaN input) raises ValueError instead of silently getting a
    bucket.
    """
    masks = [np.asarray(mask, dtype=bool) for mask, _ in rules]
    labels = [label for _, label in rules]
    if not np.logical_or.reduce(masks).all():
        raise ValueError('some rows match none of the bucketing rules')
    return np.select(masks, labels)


def feature_engineering_to_df(is_train=True):
    """Load train_V2.csv or test_V2.csv from ``base_path`` and build group-mean features.

    Steps: load and downcast the CSV, (train only) drop missing rows and outliers,
    derive engineered columns, then average the selected features per
    (matchId, groupId).

    Parameters
    ----------
    is_train : bool, default True
        True reads 'train_V2.csv', False reads 'test_V2.csv'.

    Returns
    -------
    pandas.DataFrame
        * train: one row per (matchId, groupId) with the mean of every feature,
          plus 'winPlacePerc' holding the group mean of the target.
        * test: one row per input row, carrying the means of its group, plus a
          'winPlacePerc' column filled with None (kept for backward compatibility).
    """
    target = 'winPlacePerc'
    group_keys = ['matchId', 'groupId']

    if is_train:
        print('processing train_V2.csv')
        df = reduce_mem_usage(pd.read_csv(base_path + 'train_V2.csv'))
        # Remove rows with missing values.
        df = df.dropna()
        # Outlier removal (Choi Yuna). All conditions are row-wise on untouched
        # columns, so one combined mask drops the same rows as sequential drops.
        outliers = (
            (df['headshotKills'] > 20)
            | ((df['longestKill'] == 0) & (df['kills'] != 0))
            | (df['walkDistance'] > 12000)
            | (df['swimDistance'] > 2000)
            | (df['rideDistance'] > 20000)
        )
        df = df.drop(index=df.index[outliers.to_numpy()])
    else:
        print('processing test_V2.csv')
        df = reduce_mem_usage(pd.read_csv(base_path + 'test_V2.csv'))

    state('rankPoints')
    started = timer()
    # Every non-positive rankPoints value (this includes the -1 placeholder) becomes 0.
    df['rankPoints'] = np.where(df['rankPoints'] <= 0, 0, df['rankPoints'])
    finished = timer()
    state('rankPoints', False, finished - started)

    # TODO (from the original author): confirm whether creating these helper
    # columns and removing them from the features afterwards is the right approach.
    df['playersJoined'] = df.groupby('matchId')['matchId'].transform('count')
    players_factor = (100 - df['playersJoined']) / 100 + 1
    df['damageDealtNorm'] = df['damageDealt'] * players_factor
    df['killsNorm'] = df['kills'] * players_factor
    # The distance columns may have been downcast by reduce_mem_usage; add them
    # in float64 so the sum is neither rounded to the narrow dtype nor overflows.
    ride = df['rideDistance'].astype(np.float64)
    walk = df['walkDistance'].astype(np.float64)
    swim = df['swimDistance'].astype(np.float64)
    df['totalDistance'] = ride + walk + swim

    # matchType -> numeric team size (Yoon Sanghyun):
    # solo -> 1, duo -> 2, squad -> 4, flare & crash (events) -> 0.
    # The raw string column is dropped before the numeric one is added under the
    # same name, so the frame never carries two 'matchType' columns and the new
    # column sits after 'totalDistance'.
    team_size = _encode_match_type(df['matchType'])
    df = df.drop(columns='matchType')
    df['matchType'] = team_size

    # vehicleDestroys -> categorical bucket.
    destroyed = df['vehicleDestroys']
    df['cat_vehicleDestroys'] = _bucketize([
        (destroyed == 0, 0),
        ((destroyed >= 1) & (destroyed <= 2), 1),
        (destroyed >= 3, 3),
    ])

    # weaponsAcquired -> categorical bucket.
    weapons = df['weaponsAcquired']
    df['cat_weaponsAcquired'] = _bucketize([
        (weapons == 0, 0),
        (weapons == 1, 1),
        ((weapons >= 2) & (weapons <= 3), 2),
        ((weapons >= 4) & (weapons <= 5), 4),
        ((weapons >= 6) & (weapons <= 7), 6),
        ((weapons >= 8) & (weapons <= 9), 8),
        (weapons >= 10, 10),
    ])

    # Kills without any movement (Yuna & Seungbeom) - training data only.
    if is_train:
        df['killsWithoutMoving'] = (df['killsNorm'] > 0) & (df['totalDistance'] == 0)
        df = df.drop(index=df.index[df['killsWithoutMoving'].to_numpy()])
        # NOTE(review): 'killsWithoutMoving' stays in the train features although it
        # is False for every remaining row and is never created for the test set -
        # confirm whether downstream code expects this column.
        # NOTE(review): the cap below has no effect on the returned frame, because
        # 'killsNorm' is already computed and 'kills' is excluded from the features.
        df.loc[df['kills'] >= 8, 'kills'] = 8

    # walkDistance -> categorical bucket (Sanghyun).
    df['cat_walkDistance'] = _bucketize([
        (walk < 500, 1),
        ((walk >= 500) & (walk < 1000), 2),
        ((walk >= 1000) & (walk < 2000), 3),
        ((walk >= 2000) & (walk < 5000), 4),
        (walk >= 5000, 5),
    ] if len(walk) == len(df) else [
        (df['walkDistance'] < 500, 1),
        ((df['walkDistance'] >= 500) & (df['walkDistance'] < 1000), 2),
        ((df['walkDistance'] >= 1000) & (df['walkDistance'] < 2000), 3),
        ((df['walkDistance'] >= 2000) & (df['walkDistance'] < 5000), 4),
        (df['walkDistance'] >= 5000, 5),
    ])

    # Ride + swim distance, and a 0/1 flag telling whether the player did either.
    df['sum_rideAndswim'] = (
        df['rideDistance'].astype(np.float64) + df['swimDistance'].astype(np.float64)
    )
    df['binary_rideAndswim'] = _bucketize([
        (df['sum_rideAndswim'] == 0, 0),
        (df['sum_rideAndswim'] > 0, 1),
    ])

    # heals + boosts (Seyeon). Widen first: both columns may be int8 after downcasting.
    df['heals_boosts'] = df['heals'].astype(np.int32) + df['boosts']

    # ---- group by matchId / groupId ----------------------------------------
    # Features are all remaining columns, in frame order, except the excluded
    # ones and the target.
    excluded = set(_NON_FEATURE_COLUMNS)
    excluded.add(target)
    features = [column for column in df.columns if column not in excluded]

    grouped = df.groupby(group_keys)

    print('get group mean feature')
    agg = grouped[features].agg('mean')

    if is_train:
        # One row per group. The target mean comes from the same groupby object,
        # so its rows are in the same order as the feature rows.
        df_out = agg.reset_index(drop=True)
        df_out[target] = grouped[target].agg('mean').to_numpy(dtype=np.float64)
    else:
        # One row per input row: attach the group means to every row of the group.
        df_out = df[group_keys].merge(agg.reset_index(), how='left', on=group_keys)
        df_out = df_out.drop(columns=group_keys)
        # The test file has no target; the column is still added (all None)
        # because existing callers receive it today.
        df_out[target] = None

    return df_out

In [None]:
# Build the engineered feature table from the training CSV (is_train defaults to True).
df_out = feature_engineering_to_df()

processing train_V2.csv
Memory usage of dataframe is 983.90 MB
Memory usage after optimization is: 288.39 MB
Decreased by 70.7%
Working on rankPoints ... 
Working on rankPoints took (0.026) Sec 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


get group mean feature


In [None]:
# Save the engineered training features under base_path; the row index is not written.
df_out.to_csv(base_path + 'featured_train_3(1).csv', index=False)

In [None]:
# Show the resulting feature frame as the cell output.
df_out

Unnamed: 0,assists,DBNOs,headshotKills,killStreaks,longestKill,numGroups,rankPoints,revives,damageDealtNorm,killsNorm,totalDistance,matchType,cat_vehicleDestroys,cat_weaponsAcquired,killsWithoutMoving,cat_walkDistance,binary_rideAndswim,heals_boosts,winPlacePerc
0,0.000000,1.000000,0.000000,0.500000,6.273438,28.0,0.000000,0.000000,115.155469,1.050000,131.750000,4.0,0.0,1.000000,0.0,1.000000,0.0,1.000000,0.333252
1,0.000000,0.333333,0.000000,0.000000,0.000000,28.0,0.000000,0.000000,50.386328,0.000000,19.937500,4.0,0.0,0.833333,0.0,1.000000,0.0,0.000000,0.036987
2,0.000000,0.000000,0.000000,0.000000,0.000000,28.0,0.000000,0.000000,0.000000,0.000000,4.816406,4.0,0.0,0.500000,0.0,1.000000,0.0,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,28.0,0.000000,0.000000,12.288281,0.000000,559.000000,4.0,0.0,6.000000,0.0,1.500000,0.0,0.500000,0.370361
4,1.000000,2.500000,1.000000,1.500000,25.218750,28.0,0.000000,0.000000,357.918750,3.150000,3378.000000,4.0,0.0,4.000000,0.0,4.000000,1.0,4.500000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023432,0.000000,0.000000,0.000000,1.000000,33.000000,29.0,1471.000000,0.000000,18.139844,1.070000,4720.000000,4.0,0.0,2.000000,0.0,4.000000,1.0,0.000000,0.643066
2023433,0.666667,1.333333,0.666667,1.000000,116.062500,29.0,1452.000000,1.333333,220.109310,1.426667,6324.000000,4.0,0.0,7.333333,0.0,3.666667,1.0,9.333333,0.928711
2023434,0.000000,0.166667,0.000000,0.166667,1.334961,29.0,1486.000000,0.000000,27.775417,0.178333,91.375000,4.0,0.0,0.666667,0.0,1.000000,0.0,0.000000,0.000000
2023435,0.000000,0.750000,0.000000,0.250000,0.544434,29.0,1366.250000,0.000000,63.932500,0.267500,292.250000,4.0,0.0,2.500000,0.0,1.250000,0.0,1.000000,0.250000
