In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Directory holding the exported CSVs. Set the CRICSHEET_EXPORTS_DIR environment
# variable to run the notebook on another machine; when it is unset the original
# author's local export folder is used, so existing behaviour is unchanged.
DATA_DIR = Path(
    os.environ.get(
        "CRICSHEET_EXPORTS_DIR",
        r"C:\Users\91902\OneDrive\Desktop\cricsheet_match_analysis\exports",
    )
)

# NOTE(review): the previews below show a leftover "Unnamed: 0" column, which
# suggests the CSVs were written with their index; consider index_col=0.
matches = pd.read_csv(DATA_DIR / "matches.csv")
deliveries = pd.read_csv(DATA_DIR / "deliveries.csv")
players = pd.read_csv(DATA_DIR / "players.csv")
# `teams` is read from match_players.csv (one row per match/player/role),
# not from a table of teams.
teams = pd.read_csv(DATA_DIR / "match_players.csv")
matches.head()

Unnamed: 0,match_id,format,season,venue,team1,team2,toss_winner,toss_decision,winner,result
0,1000851,Test,2016/17,Western Australia Cricket Association Ground,Australia,South Africa,South Africa,bat,South Africa,
1,1000853,Test,2016/17,Bellerive Oval,Australia,South Africa,South Africa,field,South Africa,
2,1000855,Test,2016/17,Adelaide Oval,Australia,South Africa,South Africa,bat,Australia,
3,1000881,Test,2016/17,"Brisbane Cricket Ground, Woolloongabba",Australia,Pakistan,Australia,bat,Australia,
4,1000883,Test,2016/17,Melbourne Cricket Ground,Australia,Pakistan,Pakistan,bat,Australia,


In [3]:
# Preview the first five rows of the ball-by-ball deliveries table.
deliveries.head(n=5)

Unnamed: 0,delivery_id,match_id,inning,over_num,ball_num,batsman_id,bowler_id,non_striker_id,runs_batsman,runs_extras,runs_total,dismissal_kind,player_dismissed_id
0,1,1000851,0,0,0,12,8,13,0,0,0,,
1,2,1000851,0,0,0,12,8,13,0,0,0,,
2,3,1000851,0,0,0,12,8,13,0,0,0,,
3,4,1000851,0,0,0,12,8,13,0,0,0,caught,12.0
4,5,1000851,0,0,0,14,8,13,0,0,0,,


In [4]:
# Preview the first five rows of the player_id -> name lookup table.
players.head(n=5)

Unnamed: 0,player_id,name
0,4866,A Adekunle
1,2315,A Ahmadhel
2,4214,A Aikenhead
3,976,A Aitken
4,1795,A Aitken-Drummond


In [5]:
# Preview the first five rows of the per-match player roster (match_players.csv).
teams.head(n=5)

Unnamed: 0,match_id,player_id,role
0,1000851,1,player
1,1000851,2,player
2,1000851,3,player
3,1000851,4,player
4,1000851,5,player


In [7]:
import os   

In [8]:
# Folder (relative to the working directory) that receives every saved figure.
# exist_ok=True makes this cell safe to re-run.
output_dir = "eda_outputs"
os.makedirs(output_dir, exist_ok=True)

# Global plot appearance. set_theme is seaborn's preferred interface; sns.set is
# only an alias for it that the seaborn docs say may be removed in the future.
sns.set_theme(style="whitegrid", palette="Set2")


In [17]:
# 1. Matches per Season
# Bars follow the sorted order of the distinct season labels.
season_order = sorted(matches["season"].unique())

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=matches, x="season", order=season_order, ax=ax)
ax.set_title("Matches Played per Season", fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45)  # rotate the season tick labels
fig.tight_layout()
fig.savefig(os.path.join(output_dir, "1_matches_per_season.png"))
plt.close(fig)  # release the figure once it has been written to disk



In [18]:
# 2. Matches per Format
# value_counts() sorts descending, so the most common format is drawn first.
format_order = matches["format"].value_counts().index

fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=matches, x="format", order=format_order, ax=ax)
ax.set_title("Matches by Format", fontsize=14)
fig.tight_layout()
fig.savefig(os.path.join(output_dir, "2_matches_by_format.png"))
plt.close(fig)  # release the figure once it has been written to disk


In [19]:
# 3. Most Common Venues
# Ten venues with the highest number of rows in `matches`.
top_venues = matches["venue"].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_venues.values, y=top_venues.index, ax=ax)  # horizontal bars
ax.set_title("Top 10 Venues by Matches", fontsize=14)
ax.set_xlabel("Number of Matches")
fig.tight_layout()
fig.savefig(os.path.join(output_dir, "3_top_venues.png"))
plt.close(fig)  # release the figure once it has been written to disk



In [20]:
# 4. Toss Decision Counts
# One bar per distinct value of `toss_decision` (e.g. bat / field).
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=matches, x="toss_decision", ax=ax)
ax.set_title("Toss Decisions (Bat/Field)", fontsize=14)
fig.tight_layout()
fig.savefig(os.path.join(output_dir, "4_toss_decisions.png"))
plt.close(fig)  # release the figure once it has been written to disk



In [21]:
# 5. Toss Winners (Top 10 Teams)
# Ten teams that appear most often in the `toss_winner` column.
top_toss = matches["toss_winner"].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_toss.values, y=top_toss.index, ax=ax)  # horizontal bars
ax.set_title("Top 10 Teams Winning Tosses", fontsize=14)
ax.set_xlabel("Number of Toss Wins")
fig.tight_layout()
fig.savefig(os.path.join(output_dir, "5_top_toss_winners.png"))
plt.close(fig)  # release the figure once it has been written to disk



In [22]:
# 6. Match Winners (Pie Chart)
# Only the eight most frequent winners are kept, so the printed percentages are
# shares of those eight teams' combined wins, not of all matches.
winners = matches["winner"].value_counts().head(8)

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    winners.values,
    labels=winners.index,
    autopct="%1.1f%%",
    startangle=140,
)
ax.set_title("Match Wins Distribution (Top 8 Teams)")
fig.tight_layout()
fig.savefig(os.path.join(output_dir, "6_match_winners_pie.png"))
plt.close(fig)  # release the figure once it has been written to disk



In [24]:
# 7. Dismissal Types (Horizontal Count Plot)
# value_counts() already ignores NaN, i.e. deliveries without a dismissal.
dismissals = deliveries["dismissal_kind"].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=dismissals.values,
    y=dismissals.index,
    hue=dismissals.index,  # colour each bar by its own category
    dodge=False,
    orient="h",
    palette="viridis",
    legend=False,
    ax=ax,
)
ax.set_title("Types of Dismissals")
ax.set_xlabel("Count")
fig.tight_layout()
fig.savefig(os.path.join(output_dir, "7_dismissal_types.png"))
plt.close(fig)  # release the figure once it has been written to disk



In [25]:
# 8. Runs per Ball Distribution (Histogram + KDE)
fig, ax = plt.subplots(figsize=(10, 6))
# runs_batsman is integer-valued (see the deliveries preview), so use
# discrete=True: one unit-wide bar centred on each run value. The previous
# bins=7 cut the observed range into seven equal slices, which are not centred
# on the integers and would merge values whenever the maximum is not exactly 6.
sns.histplot(
    deliveries["runs_batsman"],
    discrete=True,
    kde=True,
    color="royalblue",
    ax=ax,
)
ax.set_title("Runs Scored per Ball")
ax.set_xlabel("Runs (0–6)")
ax.set_ylabel("Frequency")
fig.tight_layout()
fig.savefig(os.path.join(output_dir, "8_runs_per_ball_dist.png"))
plt.close(fig)  # release the figure once it has been written to disk



In [26]:
# 9. Heatmap: Toss Winner vs Match Winner
# Rows are toss winners, columns are match winners; each cell counts the matches
# with that combination.
cross_tab = pd.crosstab(index=matches["toss_winner"], columns=matches["winner"])

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(cross_tab, cmap="coolwarm", cbar=True, linewidths=0.5, ax=ax)
ax.set_title("Toss Winner vs Match Winner Heatmap")
fig.tight_layout()
fig.savefig(os.path.join(output_dir, "9_toss_vs_winner_heatmap.png"))
plt.close(fig)  # release the figure once it has been written to disk


In [27]:
# 10. Average Runs per Over (Lineplot with Marker)
# Each row of `deliveries` is a single ball, so averaging runs_total directly by
# over_num gives runs per *ball*. To get runs per *over*, first total the runs of
# every individual over (match, inning, over), then average those totals for
# each over number.
over_totals = deliveries.groupby(["match_id", "inning", "over_num"])["runs_total"].sum()
runs_per_over = over_totals.groupby(level="over_num").mean()

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    x=runs_per_over.index,
    y=runs_per_over.values,
    marker="o",
    linewidth=2.5,
    ax=ax,
)
ax.set_title("Average Runs per Over")
ax.set_xlabel("Over Number")
ax.set_ylabel("Average Runs")
ax.grid(True)
fig.tight_layout()
fig.savefig(os.path.join(output_dir, "10_avg_runs_per_over.png"))
plt.close(fig)  # release the figure once it has been written to disk

