In [None]:
# 1 Load Data
!wget https://www.dropbox.com/s/kqu004pn2xpg0tr/train_V2.csv
!wget https://www.dropbox.com/s/5rl09pble4g6dk1/test_V2.csv

In [None]:
# 2 Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import gc
import os
import sys
%matplotlib inline
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression

In [4]:
# 3 Data Cleaning - Reduce size - As the amount of dataset is too big, we need to use a memory saving function which will help us to reduce the memory usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                #if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                #    df[col] = df[col].astype(np.float16)
                #el
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        #else:
            #df[col] = df[col].astype('category')
 
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
train_data=pd.read_csv("train_V2.csv")
train_data= reduce_mem_usage(train_data)

In [None]:
test_data=pd.read_csv("/content/test_V2.csv")
test_data= reduce_mem_usage(test_data)

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
# 3 Data cleaning - Check if any 'null' values
train_data.isna().any()

In [None]:
# Get the percentage for each column for null values
null_columns=pd.DataFrame({'Columns':train_data.isna().sum().index,'No. Null values':train_data.isna().sum().values,'Percentage':train_data.isna().sum().values/train_data.shape[0]})
print(null_columns)

In [None]:
# 4 Exploratory Data Analysis
train_data.describe()

In [None]:
# 4 Exploratory Data Analysis - Find the unique id, unique group id and match id 
print('unique Id=',train_data['Id'].nunique())
print('unique groupId=',train_data['groupId'].nunique())
print('unique matchId=',train_data['matchId'].nunique())

In [None]:
# 4 Exploratory Data Analysis - game modes in the game
train_data.groupby(["matchType"]).count()

In [None]:
# 4 Exploratory Data Analysis - Ideally matchtype should be three types only 1.Single 2.Duo 3.Squad but we find more in data. We need to address only three types
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
train_data.groupby('matchId')['matchType'].first().value_counts().plot.bar()

In [None]:
# 4 Exploratory Data Analysis - new_train_data = data with only three matchtype
new_train_data=train_data
def mapthematch(data):
  mapping = lambda y: 'solo' if ('solo' in y) else 'duo' if('duo' in y) or ('crash' in y) else 'squad'
  data['matchType'] = data['matchType'].apply(mapping)
  return(new_train_data)
data=mapthematch(new_train_data)
data.groupby('matchId')['matchType'].first().value_counts().plot.bar()

In [None]:
# 4 Exploratory Data Analysis - Find the Illegal match, WinPlaceperc is null and we will drop the '2744604' column
data[data['winPlacePerc'].isnull()]

In [26]:
data.drop(2744604, inplace=True)

In [None]:
data['matchDuration'].hist(bins=50)

In [None]:
# 4 Exploratory Data Analysis - matchtype based teamkills
d=data[['teamKills','matchType']]
d.groupby('matchType').hist(bins=80)

In [None]:
# 4 Exploratory Data Analysis - Normalize the columns
data['killsNormalization'] = data['kills']*((100-data['kills'])/100 + 1)
data['damageDealtNormalization'] = data['damageDealt']*((100-data['damageDealt'])/100 + 1)
data['maxPlaceNormalization'] = data['maxPlace']*((100-data['maxPlace'])/100 + 1)
data['matchDurationNormalization'] = data['matchDuration']*((100-data['matchDuration'])/100 + 1)
# Let’s compare the actual and normalized data:
New_normalized_column = data[['Id','matchDuration','matchDurationNormalization','kills','killsNormalization','maxPlace','maxPlaceNormalization','damageDealt','damageDealtNormalization']]
New_normalized_column.head()

In [None]:
# 5 Feature Engineering - Create new feature healsandboosts
data['healsandboostsfeature'] = data['heals'] + data['boosts']
data[['heals', 'boosts', 'healsandboostsfeature']].tail()

In [None]:
# 5 Feature Engineering - Create new feature totalDistancetravelled
data['totalDistancetravelled'] = data['rideDistance'] + data['walkDistance'] + data['swimDistance']
data[['rideDistance', 'walkDistance', 'swimDistance','totalDistancetravelled']].tail()

In [None]:
# 5 Feature Engineering - Create new feature headshot_rate
data['headshot_rate'] = data['headshotKills'] / data['kills']
data['headshot_rate'].head()

In [66]:
# 6 Data Split - Now we will split our training data into two parts for: Train the model (80%) Test the model (20%) for validation purpose we will use test_v2.csv
x=data[['killsNormalization', 'damageDealtNormalization','maxPlaceNormalization', 'matchDurationNormalization','healsandboostsfeature','totalDistancetravelled']]
#drop the target variable
y=data['winPlacePerc']
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [58]:
linear=LinearRegression().fit(xtrain,ytrain)

In [61]:
y_pred=linear.predict(xtest)
y_pred

array([0.41503356, 0.78588353, 0.95530506, ..., 0.30601349, 0.37773769,
       0.90921515])

In [74]:
df1 = df.head(25)
df1.plot(kind='bar',figsize=(26,10))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

NameError: ignored