In [1]:
import numpy as np


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
import os
import matplotlib.dates as mdates

pd.set_option('display.max_columns',None)

In [2]:
df_train = pd.read_csv("/Users/krc/Downloads/pubg-finish-placement-prediction/train_V2.csv")

In [3]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

def preprocess(df):
    df = __delete_nan_data(df)
    new_col_name = "match_types"
    df[new_col_name] = __convert_match_type_column(df,"matchType")
    df = __change_nan_points(df)
    df = __one_hot_encode_data_frame(df, new_col_name)
    df = __select_features(df)
    return df

  
def __delete_nan_data(df):
    return df.dropna()

  
def __convert_match_type_column(prepro_df,encoding_feature):
    encoded = prepro_df[encoding_feature].agg(preprocessing_match_type)
    return encoded

  
def preprocessing_match_type(match_type):
    standard_matches = ["solo", "duo", "squad", "solo-fpp", "duo-fpp", "squad-fpp"]
    if match_type in standard_matches:
        return match_type
    else:
        return "others" 

      
def __change_nan_points(df):
    kill_rank_win_points = ["killPoints", "rankPoints", "winPoints"]
    match_types_list = list(df.match_types.unique())
    for col in kill_rank_win_points:
        if col != "rankPoints":
            cond0 = df[col] == 0
            cond1 = df[col] != 0
        else:
            cond0 = df[col] == -1
            cond1 = df[col] != -1
        for m_type in match_types_list:
            cond2 = df.match_types == m_type
            mean = df[cond1 & cond2][col].mean()
            std = df[cond1 & cond2][col].std()
            size = df[cond0 & cond2][col].count()
            if m_type != 'others' or col == "rankPoints":
                rand_points = np.random.randint(mean-std, mean+std, size=size)
            else:
                rand_points = np.array([mean]*size)
            df[col].loc[cond0 & cond2] = rand_points
    return df

  
def __one_hot_encode_data_frame(df, encoding_feature):
    df = pd.get_dummies(df, columns=[encoding_feature])
    return df


def __select_features(df):
    main_columns = ["winPlacePerc", "walkDistance", "boosts", "weaponsAcquired"]
    kill_columns = ["kills", "damageDealt"]
    match_type_columns = df.columns[df.columns.str.contains("match_types")]
    deleted_columns = list(set(df.columns)-set(main_columns)-set(kill_columns)-set(match_type_columns))
    return df.drop(columns=deleted_columns)


def poly_reg(df):

    X = df.drop(columns='winPlacePerc')
    y = df.winPlacePerc

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0xC0FFEE)
    poly = PolynomialFeatures(degree=2, include_bias=True)

    X_train_poly = poly.fit_transform(X_train)

    lin_reg = LinearRegression()
    lin_reg.fit(X_train_poly, y_train)

    

    X_val_poly = poly.transform(X_val)

    y_pred = lin_reg.predict(X_val_poly)

    mae_train_p = mean_absolute_error(y_val, y_pred)
    print(mae_train_p)

    
    
print(df_train)
print(df_train.columns)
df_train = preprocess(df_train)
print(df_train)
print(df_train.columns)
poly_reg(df_train)
print(poly_reg(df_train))

                     Id         groupId         matchId  assists  boosts  \
0        7f96b2f878858a  4d4b580de459be  a10357fd1a4a91        0       0   
1        eef90569b9d03c  684d5656442f9e  aeb375fc57110c        0       0   
2        1eaf90ac73de72  6a4a42c3245a74  110163d8bb94ae        1       0   
3        4616d365dd2853  a930a9c79cd721  f1f1f4ef412d7e        0       0   
4        315c96c26c9aac  de04010b3458dd  6dc8ff871e21e6        0       0   
...                 ...             ...             ...      ...     ...   
4446961  afff7f652dbc10  d238e426f50de7  18492834ce5635        0       0   
4446962  f4197cf374e6c0  408cdb5c46b2ac  ee854b837376d9        0       1   
4446963  e1948b1295c88a  e26ac84bdf7cef  6d0cd12784f1ab        0       0   
4446964  cc032cdd73b7ac  c2223f35411394  c9c701d0ad758a        0       4   
4446965  0d8e7ed728b6fd  8c74f72fedf5ff  62a16aabcc095c        0       2   

         damageDealt  DBNOs  headshotKills  heals  killPlace  killPoints  \
0          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_col_name] = __convert_match_type_column(df,"matchType")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col].loc[cond0 & cond2] = rand_points


         boosts  damageDealt  kills  walkDistance  weaponsAcquired  \
0             0         0.00      0        244.80                1   
1             0        91.47      0       1434.00                5   
2             0        68.00      0        161.80                2   
3             0        32.90      0        202.70                3   
4             0       100.00      1         49.75                2   
...         ...          ...    ...           ...              ...   
4446961       0         0.00      0       1019.00                3   
4446962       1        44.15      0         81.70                6   
4446963       0        59.06      0        788.70                4   
4446964       4       180.40      2       2748.00                8   
4446965       2       268.00      2       1244.00                5   

         winPlacePerc  match_types_duo  match_types_duo-fpp  \
0              0.4444                0                    0   
1              0.6400            