Code: Use Pandas to read the CSV file and store them in a DataFrame object named badminton2.
Next, it will display the data frame.

In [1]:
import pandas as pd
badminton = pd.read_csv('badmintondata.csv', header='infer')

badminton

Unnamed: 0,HUMAN PLAYER POSITION (X) metres,HUMAN PLAYER POSITION (Y) metres,INITITAL VELOCITY OF SHUTTELCOCK(m/s),INITIAL SHUTTELCOCK FIRING ANGLE (DEGREE),SHUTTELCOCK SLANT ANGLE TO SIDELINE(DEGREE),SHUTTLECOCK POSITIION IN AIR(X ) metres,SHUTTLECOCK POSITIION IN AIR(Y) metres,SHUTTLECOCK POSITIION IN AIR(Z) metres
0,4,1,10,40,0,4.075543,1.0,1.662912
1,4,1,10,40,0,4.152007,1.0,1.724866
2,4,1,10,40,0,4.228906,1.0,1.788139
3,4,1,10,40,0,4.302100,1.0,1.845245
4,4,1,10,40,0,4.376877,1.0,1.904128
...,...,...,...,...,...,...,...,...
72795,0,0,0,0,0,0.000000,0.0,0.000000
72796,0,0,0,0,0,0.000000,0.0,0.000000
72797,0,0,0,0,0,0.000000,0.0,0.000000
72798,0,0,0,0,0,0.000000,0.0,0.000000


Check Data Types

In [2]:
badminton.dtypes

HUMAN PLAYER POSITION (X) metres                 int64
HUMAN PLAYER POSITION (Y) metres                 int64
INITITAL VELOCITY OF SHUTTELCOCK(m/s)            int64
INITIAL SHUTTELCOCK FIRING ANGLE (DEGREE)        int64
SHUTTELCOCK SLANT ANGLE TO SIDELINE(DEGREE)      int64
SHUTTLECOCK POSITIION IN AIR(X ) metres        float64
SHUTTLECOCK POSITIION IN AIR(Y) metres         float64
SHUTTLECOCK POSITIION IN AIR(Z) metres         float64
dtype: object

Identify if there is any Missing Data in the attributes

In [3]:
print(badminton.isnull().sum())

HUMAN PLAYER POSITION (X) metres               0
HUMAN PLAYER POSITION (Y) metres               0
INITITAL VELOCITY OF SHUTTELCOCK(m/s)          0
INITIAL SHUTTELCOCK FIRING ANGLE (DEGREE)      0
SHUTTELCOCK SLANT ANGLE TO SIDELINE(DEGREE)    0
SHUTTLECOCK POSITIION IN AIR(X ) metres        0
SHUTTLECOCK POSITIION IN AIR(Y) metres         0
SHUTTLECOCK POSITIION IN AIR(Z) metres         0
dtype: int64


Initialize SHOT ID to each group

In [4]:
# Create boolean mask if all row values are 0
mask = (badminton == 0).all(axis=1)

# Identify where the 'all zeros' condition changes
changes = mask.ne(mask.shift())

# Cumulatively sum these changes, assign SHOT_ID to each group
badminton['SHOT_ID'] = changes.cumsum()

# For 'all zero' rows, assign SHOT_ID as 0
badminton.loc[mask, 'SHOT_ID'] = 0

Identify Duplicate Data

In [5]:
# Check whether each row is a duplicate of a previous row in the table
dups = badminton.duplicated()

dups

0        False
1        False
2        False
3        False
4        False
         ...  
72795     True
72796     True
72797     True
72798     True
72799     True
Length: 72800, dtype: bool

In [6]:
duplicate = badminton[badminton.duplicated()]

duplicate

Unnamed: 0,HUMAN PLAYER POSITION (X) metres,HUMAN PLAYER POSITION (Y) metres,INITITAL VELOCITY OF SHUTTELCOCK(m/s),INITIAL SHUTTELCOCK FIRING ANGLE (DEGREE),SHUTTELCOCK SLANT ANGLE TO SIDELINE(DEGREE),SHUTTLECOCK POSITIION IN AIR(X ) metres,SHUTTLECOCK POSITIION IN AIR(Y) metres,SHUTTLECOCK POSITIION IN AIR(Z) metres,SHOT_ID
123,0,0,0,0,0,0.0,0.0,0.0,0
124,0,0,0,0,0,0.0,0.0,0.0,0
125,0,0,0,0,0,0.0,0.0,0.0,0
126,0,0,0,0,0,0.0,0.0,0.0,0
127,0,0,0,0,0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...
72795,0,0,0,0,0,0.0,0.0,0.0,0
72796,0,0,0,0,0,0.0,0.0,0.0,0
72797,0,0,0,0,0,0.0,0.0,0.0,0
72798,0,0,0,0,0,0.0,0.0,0.0,0


The results suggest there are 112715 duplicate rows in the dataset.

In [7]:
# Drop duplicate data
print('Number of rows before discarding duplicates = %d' % (badminton.shape[0]))
data2 = badminton.drop_duplicates()
print('Number of rows after discarding duplicates = %d' % (data2.shape[0]))

Number of rows before discarding duplicates = 72800
Number of rows after discarding duplicates = 39159


Remove rows that are all zeros

In [8]:
data2 = data2.loc[~(data2==0).all(axis=1)]

data2

Unnamed: 0,HUMAN PLAYER POSITION (X) metres,HUMAN PLAYER POSITION (Y) metres,INITITAL VELOCITY OF SHUTTELCOCK(m/s),INITIAL SHUTTELCOCK FIRING ANGLE (DEGREE),SHUTTELCOCK SLANT ANGLE TO SIDELINE(DEGREE),SHUTTLECOCK POSITIION IN AIR(X ) metres,SHUTTLECOCK POSITIION IN AIR(Y) metres,SHUTTLECOCK POSITIION IN AIR(Z) metres,SHOT_ID
0,4,1,10,40,0,4.075543,1.000000,1.662912,1
1,4,1,10,40,0,4.152007,1.000000,1.724866,1
2,4,1,10,40,0,4.228906,1.000000,1.788139,1
3,4,1,10,40,0,4.302100,1.000000,1.845245,1
4,4,1,10,40,0,4.376877,1.000000,1.904128,1
...,...,...,...,...,...,...,...,...,...
72634,4,4,70,70,15,9.217823,5.398111,2.591053,727
72635,4,4,70,70,15,9.275177,5.413480,2.041410,727
72636,4,4,70,70,15,9.326363,5.427195,1.463594,727
72637,4,4,70,70,15,9.370879,5.439123,0.862842,727


Remove abnormal values (Value is Negative or Value = 0)

In [9]:
data2 = data2[(data2[data2.columns] >= 0).all(axis=1)]

data2

Unnamed: 0,HUMAN PLAYER POSITION (X) metres,HUMAN PLAYER POSITION (Y) metres,INITITAL VELOCITY OF SHUTTELCOCK(m/s),INITIAL SHUTTELCOCK FIRING ANGLE (DEGREE),SHUTTELCOCK SLANT ANGLE TO SIDELINE(DEGREE),SHUTTLECOCK POSITIION IN AIR(X ) metres,SHUTTLECOCK POSITIION IN AIR(Y) metres,SHUTTLECOCK POSITIION IN AIR(Z) metres,SHOT_ID
0,4,1,10,40,0,4.075543,1.000000,1.662912,1
1,4,1,10,40,0,4.152007,1.000000,1.724866,1
2,4,1,10,40,0,4.228906,1.000000,1.788139,1
3,4,1,10,40,0,4.302100,1.000000,1.845245,1
4,4,1,10,40,0,4.376877,1.000000,1.904128,1
...,...,...,...,...,...,...,...,...,...
72634,4,4,70,70,15,9.217823,5.398111,2.591053,727
72635,4,4,70,70,15,9.275177,5.413480,2.041410,727
72636,4,4,70,70,15,9.326363,5.427195,1.463594,727
72637,4,4,70,70,15,9.370879,5.439123,0.862842,727


In [10]:
# Calculate Q1, Q3, and IQR
Q1 = data2.iloc[:, 1:].groupby(data2['SHOT_ID']).quantile(0.25)  # Group by SHOT_ID
Q3 = data2.iloc[:, 1:].groupby(data2['SHOT_ID']).quantile(0.75)  # Group by SHOT_ID
IQR = Q3 - Q1

# Duplicate the original dataframe for use in the next steps
df_temp = data2.copy()

# Calculate outlier mask for each row
for shot_id in data2['SHOT_ID'].unique():
    outlier_mask = ((df_temp[df_temp['SHOT_ID'] == shot_id].iloc[:, 1:] < (Q1.loc[shot_id] - 1.5 * IQR.loc[shot_id])) | 
                    (df_temp[df_temp['SHOT_ID'] == shot_id].iloc[:, 1:] > (Q3.loc[shot_id] + 1.5 * IQR.loc[shot_id]))).any(axis=1)
    df_temp.loc[df_temp['SHOT_ID'] == shot_id, 'outlier_mask'] = outlier_mask

# Identify outlier SHOT_IDs
outlier_shot_ids = df_temp[df_temp['outlier_mask']].SHOT_ID.unique()

# Filter out outliers by SHOT_ID
df_no_outliers = df_temp[~df_temp['SHOT_ID'].isin(outlier_shot_ids)].drop(columns='outlier_mask')

# Calculate and print number of rows removed
removed_rows_outliers = data2.shape[0] - df_no_outliers.shape[0]
print(f"\nData after removing outliers: (removed {removed_rows_outliers} rows)")

# Update data2 to df_no_outliers
data2 = df_no_outliers
data2

  outlier_mask = ((df_temp[df_temp['SHOT_ID'] == shot_id].iloc[:, 1:] < (Q1.loc[shot_id] - 1.5 * IQR.loc[shot_id])) |
  (df_temp[df_temp['SHOT_ID'] == shot_id].iloc[:, 1:] > (Q3.loc[shot_id] + 1.5 * IQR.loc[shot_id]))).any(axis=1)



Data after removing outliers: (removed 11468 rows)


Unnamed: 0,HUMAN PLAYER POSITION (X) metres,HUMAN PLAYER POSITION (Y) metres,INITITAL VELOCITY OF SHUTTELCOCK(m/s),INITIAL SHUTTELCOCK FIRING ANGLE (DEGREE),SHUTTELCOCK SLANT ANGLE TO SIDELINE(DEGREE),SHUTTLECOCK POSITIION IN AIR(X ) metres,SHUTTLECOCK POSITIION IN AIR(Y) metres,SHUTTLECOCK POSITIION IN AIR(Z) metres,SHOT_ID
18800,4,1,10,60,0,4.049659,1.000000,1.685961,189
18801,4,1,10,60,0,4.100141,1.000000,1.770894,189
18802,4,1,10,60,0,4.147209,1.000000,1.852351,189
18803,4,1,10,60,0,4.197915,1.000000,1.935086,189
18804,4,1,10,60,0,4.246229,1.000000,2.013513,189
...,...,...,...,...,...,...,...,...,...
72634,4,4,70,70,15,9.217823,5.398111,2.591053,727
72635,4,4,70,70,15,9.275177,5.413480,2.041410,727
72636,4,4,70,70,15,9.326363,5.427195,1.463594,727
72637,4,4,70,70,15,9.370879,5.439123,0.862842,727


Check data that is served out of court

In [11]:
import numpy as np

# Define court dimensions
court_length = 13.4  # in metres
court_width = 5.18  # in metres

# Initialize 'OUT_OF_COURT_SHOTS' column
data2 = data2.assign(OUT_OF_COURT_SHOTS = np.nan)

# Assign values to 'OUT_OF_COURT_SHOTS' column
data2['OUT_OF_COURT_SHOTS'] = np.where((data2['SHUTTLECOCK POSITIION IN AIR(X ) metres'] > court_length) | 
                                     (data2['SHUTTLECOCK POSITIION IN AIR(Y) metres'] > court_width), 0, 1)

# Count the number of 'IN' and 'OUT' rows
in_court = data2['OUT_OF_COURT_SHOTS'].sum()
out_of_court = data2.shape[0] - in_court

print(f"Number of 'IN' rows: {in_court}")
print(f"Number of 'OUT' rows: {out_of_court}")
data2

Number of 'IN' rows: 27070
Number of 'OUT' rows: 620


Unnamed: 0,HUMAN PLAYER POSITION (X) metres,HUMAN PLAYER POSITION (Y) metres,INITITAL VELOCITY OF SHUTTELCOCK(m/s),INITIAL SHUTTELCOCK FIRING ANGLE (DEGREE),SHUTTELCOCK SLANT ANGLE TO SIDELINE(DEGREE),SHUTTLECOCK POSITIION IN AIR(X ) metres,SHUTTLECOCK POSITIION IN AIR(Y) metres,SHUTTLECOCK POSITIION IN AIR(Z) metres,SHOT_ID,OUT_OF_COURT_SHOTS
18800,4,1,10,60,0,4.049659,1.000000,1.685961,189,1
18801,4,1,10,60,0,4.100141,1.000000,1.770894,189,1
18802,4,1,10,60,0,4.147209,1.000000,1.852351,189,1
18803,4,1,10,60,0,4.197915,1.000000,1.935086,189,1
18804,4,1,10,60,0,4.246229,1.000000,2.013513,189,1
...,...,...,...,...,...,...,...,...,...,...
72634,4,4,70,70,15,9.217823,5.398111,2.591053,727,0
72635,4,4,70,70,15,9.275177,5.413480,2.041410,727,0
72636,4,4,70,70,15,9.326363,5.427195,1.463594,727,0
72637,4,4,70,70,15,9.370879,5.439123,0.862842,727,0


Re-number SHOT_ID after removed abnormal data rows

In [12]:
# Create sorted list of unique shotIds
shotIds = sorted(data2['SHOT_ID'].unique())

# Map old shotIds to new shotIds
map_dict = {old_id: new_id for new_id, old_id in enumerate(shotIds, start=1)}

# Replace old shotIds with new shotIds
data2.loc[:, 'SHOT_ID'] = data2['SHOT_ID'].replace(map_dict)

# Reorder columns
data2 = data2[['SHOT_ID', 'HUMAN PLAYER POSITION (X) metres', 'HUMAN PLAYER POSITION (Y) metres',
         'INITITAL VELOCITY OF SHUTTELCOCK(m/s)', 'INITIAL SHUTTELCOCK FIRING ANGLE (DEGREE)',
         'SHUTTELCOCK SLANT ANGLE TO SIDELINE(DEGREE)', 'SHUTTLECOCK POSITIION IN AIR(X ) metres',
         'SHUTTLECOCK POSITIION IN AIR(Y) metres', 'SHUTTLECOCK POSITIION IN AIR(Z) metres', 'OUT_OF_COURT_SHOTS']]

data2

Unnamed: 0,SHOT_ID,HUMAN PLAYER POSITION (X) metres,HUMAN PLAYER POSITION (Y) metres,INITITAL VELOCITY OF SHUTTELCOCK(m/s),INITIAL SHUTTELCOCK FIRING ANGLE (DEGREE),SHUTTELCOCK SLANT ANGLE TO SIDELINE(DEGREE),SHUTTLECOCK POSITIION IN AIR(X ) metres,SHUTTLECOCK POSITIION IN AIR(Y) metres,SHUTTLECOCK POSITIION IN AIR(Z) metres,OUT_OF_COURT_SHOTS
18800,1,4,1,10,60,0,4.049659,1.000000,1.685961,1
18801,1,4,1,10,60,0,4.100141,1.000000,1.770894,1
18802,1,4,1,10,60,0,4.147209,1.000000,1.852351,1
18803,1,4,1,10,60,0,4.197915,1.000000,1.935086,1
18804,1,4,1,10,60,0,4.246229,1.000000,2.013513,1
...,...,...,...,...,...,...,...,...,...,...
72634,270,4,4,70,70,15,9.217823,5.398111,2.591053,0
72635,270,4,4,70,70,15,9.275177,5.413480,2.041410,0
72636,270,4,4,70,70,15,9.326363,5.427195,1.463594,0
72637,270,4,4,70,70,15,9.370879,5.439123,0.862842,0


In [13]:
data2.to_csv("badmintondata1_cleaned_data.csv", index=False)