In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the dataset (path is resolved relative to the current working directory)
df = pd.read_csv("pubg.csv")

# --- Initial Exploration ---
# Each entry pairs a heading with the summary printed after it:
# frame dimensions, per-column dtypes, and per-column null counts.
overview = (
    ("Shape of data:", df.shape),
    ("\nData types:\n", df.dtypes),
    ("\nMissing values:\n", df.isna().sum()),
)
for heading, summary in overview:
    print(heading, summary)

In [None]:
# --- Drop unneeded IDs (they don’t help in analysis) ---
# Reassign instead of mutating in place and tolerate already-removed columns,
# so re-running this cell does not fail with KeyError.
df = df.drop(columns=["Id", "groupId", "matchId"], errors="ignore")

# --- Check for impossible values ---
# Example: negative values for kills, distance, or heals don’t make sense
num_cols = df.select_dtypes(include=[np.number]).columns
# Test for "< 0" rather than requiring ">= 0": NaN fails every comparison, so a
# ">= 0" filter would also discard every row with a missing numeric value before
# it could be imputed below. `.copy()` detaches the filtered result so the
# column assignment that follows does not raise SettingWithCopyWarning.
has_negative = df[num_cols].lt(0).any(axis=1)
df = df.loc[~has_negative].copy()

# --- Handle missing values ---
# Fill missing numeric values with the column median of the remaining rows
df[num_cols] = df[num_cols].fillna(df[num_cols].median())


In [None]:
# --- Normalise column values, then drop incomplete rows ---
# Written as one non-mutating chain: `assign` and `dropna` each return a new
# frame, so nothing is written into a filtered slice (no SettingWithCopyWarning)
# and no in-place mutation is needed.
df = df.assign(
    # Convert categorical to lowercase for consistency
    matchType=df["matchType"].str.lower(),
    # Re-parse damageDealt in case it was read as object; values that cannot be
    # parsed become NaN and are removed by the dropna() below
    damageDealt=pd.to_numeric(df["damageDealt"], errors="coerce"),
).dropna()  # drop every row that still has a missing value in any column

In [None]:
# --- Final check ---
print("\nCleaned Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
# isnull().sum() counts nulls per column; the second .sum() totals them for the whole data set
print("\nAny remaining nulls:", df.isnull().sum().sum())


In [None]:
# PART 2 — FEATURE ENGINEERING

# totalDistance: walking, riding and swimming distance added together
df["totalDistance"] = df["walkDistance"].add(df["rideDistance"]).add(df["swimDistance"])

# headshotRate: headshotKills divided by kills; rows where kills is not
# greater than 0 are set to 0 instead of keeping the division-by-zero result
df["headshotRate"] = df["headshotKills"].div(df["kills"]).where(df["kills"].gt(0), 0)

# itemsUsed: heals plus boosts
df["itemsUsed"] = df["heals"].add(df["boosts"])

# --- Show new columns ---
engineered_cols = ["totalDistance", "headshotRate", "itemsUsed"]
print(df[engineered_cols].head(25))  # adjust 25 to preview more or fewer rows


In [None]:
# PART 3 — Exploratory Data Analysis

def scatter_plot(x, y, title, xlabel, ylabel, rotation=0):
    """Draw a scatter plot of two columns of the notebook-level ``df`` and show it.

    Parameters
    ----------
    x, y
        Column keys of ``df`` plotted on the horizontal and vertical axis.
    title, xlabel, ylabel
        Text used for the plot title and the two axis labels.
    rotation
        Rotation in degrees applied to the x-axis tick labels. The default of
        0 leaves them horizontal.

    Notes
    -----
    Reads the global ``df`` at call time and finishes with ``plt.show()``;
    nothing is returned.
    """
    plt.scatter(df[x], df[y], alpha=0.4, color='blue')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # Apply the requested tick-label rotation (this argument was previously accepted but ignored).
    plt.xticks(rotation=rotation)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.show()

# Distance travelled against final placement
scatter_plot("totalDistance", "winPlacePerc", "Travel vs Placement", "Distance", "Win %")

# Combat Performance Across Match Types
df.boxplot(column="kills", by="matchType")
# With `by=`, pandas adds its own figure-level title ("Boxplot grouped by
# matchType"); clear it so only the title set below is shown.
plt.suptitle("")
plt.title("Kills Distribution by Match Type")
plt.xlabel("Match Type")
plt.ylabel("Number of Kills")
plt.xticks(rotation=45)  # tilt the match-type labels
plt.show()

In [None]:
# Top Players
# Keep the rows whose winPlacePerc exceeds 0.9 and average a handful of their stats.
top_players = df.loc[df["winPlacePerc"].gt(0.9)]
stat_columns = ["kills", "damageDealt", "boosts", "totalDistance", "itemsUsed"]
avg_stats = top_players[stat_columns].mean()

# One small single-bar figure per stat, so each mean gets its own y-axis scale.
for stat_name, stat_mean in avg_stats.items():
    plt.figure(figsize=(5, 3))
    plt.bar(stat_name, stat_mean, color='green')
    plt.title(f"Average {stat_name} of Top Players")
    plt.ylabel("Average Value")
    plt.grid(axis='y', linestyle='--', alpha=0.6)
    plt.show()


In [None]:
# Define player types
# Per-column flag: is the row's value strictly above that column's median?
above_median = {
    col: df[col].gt(df[col].median())
    for col in ("kills", "damageDealt", "totalDistance", "itemsUsed")
}

# Everyone starts as "Balanced". The Survivalist rule is applied last, so a row
# matching both rules ends up labelled "Survivalist".
df["PlayerType"] = "Balanced"
df.loc[above_median["kills"] & above_median["damageDealt"], "PlayerType"] = "Aggressive"
df.loc[above_median["totalDistance"] & above_median["itemsUsed"], "PlayerType"] = "Survivalist"

# Compare win rates
grouped = df.groupby("PlayerType")["winPlacePerc"].mean()
print(grouped)  # show the numbers before the chart
# groupby sorts its keys, so the colours are applied to the labels in sorted order.
grouped.plot.bar(color=['red','blue','green'])
plt.title("Average Win Percentile by Player Type")
plt.ylabel("Average Win %")
plt.show()

In [None]:
# Question explored here: do players who travel a lot but use few items tend
# to finish with a low win percentile?
# Each point is one row; its colour encodes winPlacePerc.
points = plt.scatter(
    df["totalDistance"],
    df["itemsUsed"],
    c=df["winPlacePerc"],
    cmap='viridis',
    alpha=0.5,
)
plt.colorbar(points, label="Win Percentile")
plt.title("Travel vs Item Usage vs Win Percentile")
plt.xlabel("Total Distance")
plt.ylabel("Items Used")
plt.show()