In [1]:
# libraries
import numpy as np # used for handling numbers
import pandas as pd # used for handling the dataset
from sklearn.impute import SimpleImputer # used for handling missing data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # used for encoding categorical data
from sklearn.model_selection import train_test_split # used for splitting training and testing data
from sklearn.preprocessing import StandardScaler # used for feature scaling

In [57]:
# Read the CSV into a DataFrame.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere without editing it.
dataset = pd.read_csv('/Users/apple/output.csv')

# Features: every column except the last. Target: the last column only.
# NOTE(review): the Y displayed below is NaN in every visible row -- confirm the
# last column is really the intended target (e.g. rather than 'Medal').
X, Y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [58]:
# Display the feature array (every column of `dataset` except the last).
X

array([[1, 'A Dijiang', 'M', ..., "Basketball Men's Basketball", nan,
        'China'],
       [2, 'A Lamusi', 'M', ..., "Judo Men's Extra-Lightweight", nan,
        'China'],
       [3, 'Gunnar Nielsen Aaby', 'M', ..., "Football Men's Football",
        nan, 'Denmark'],
       ...,
       [135570, 'Piotr ya', 'M', ...,
        "Ski Jumping Men's Large Hill, Team", nan, 'Poland'],
       [135571, 'Tomasz Ireneusz ya', 'M', ..., "Bobsleigh Men's Four",
        nan, 'Poland'],
       [135571, 'Tomasz Ireneusz ya', 'M', ..., "Bobsleigh Men's Four",
        nan, 'Poland']], dtype=object)

In [59]:
# Display the target array (the last column of `dataset`).
Y

array([nan, nan, nan, ..., nan, nan, nan], dtype=object)

In [61]:
# Replace missing entries (np.nan) in columns 3-5 of X with that column's mean.
# fit_transform learns the per-column means and applies them in one call.
# Presumably these are the numeric athlete measurements -- confirm against the CSV header.
# NOTE(review): the means are learned from all rows, before the train/test split
# in a later cell, so test rows influence the fill values; consider fitting on
# the training split only.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:, 3:6] = imputer.fit_transform(X[:, 3:6])

In [62]:
# Display columns 3-5 of X, the slice that the imputer was applied to.
X[:, 3:6]

array([[24.0, 180.0, 80.0],
       [23.0, 170.0, 60.0],
       [24.0, 175.33896987366376, 70.70239290053351],
       ...,
       [27.0, 176.0, 59.0],
       [30.0, 185.0, 96.0],
       [34.0, 185.0, 96.0]], dtype=object)

In [63]:
def _fill_with_group_mode(values):
    """Return `values` with missing entries replaced by the most frequent value.

    `Series.mode()` ignores missing entries, so a group with no non-missing
    values yields an empty result; such a group is returned unchanged instead
    of raising on the empty lookup.
    """
    modes = values.mode()
    if modes.empty:
        return values
    # With ties, mode() returns the candidates sorted; keep the first, as before.
    return values.fillna(modes.iloc[0])


# Work on a copy so the raw `dataset` stays exactly as loaded.
df = dataset.copy()

# Fill each missing Medal with the most common Medal of the same Year and STORE
# the result. Previously the filled Series was only displayed and then
# discarded, leaving `df` unchanged.
# NOTE(review): X and Y were extracted from `dataset` in an earlier cell, so
# they do not see this fill. Also confirm that a missing Medal means "unknown"
# and not "no medal won" before relying on this imputation.
df['Medal'] = df.groupby('Year')['Medal'].transform(_fill_with_group_mode)

# Show the filled column.
df['Medal']

0         Bronze
1         Bronze
2           Gold
3           Gold
4         Bronze
           ...  
271111    Bronze
271112      Gold
271113      Gold
271114    Bronze
271115      Gold
Name: Medal, Length: 271116, dtype: object

In [64]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [65]:
# Display the four split arrays together as a tuple.
X_train, X_test, Y_train, Y_test

(array([[107795, 'Uta Schtz', 'F', ...,
         "Swimming Women's 200 metres Freestyle", nan, 'Germany'],
        [2931, 'Rose "Rosie" Allwood (-Morrison)', 'F', ...,
         "Athletics Women's 4 x 100 metres Relay", nan, 'Jamaica'],
        [19866, 'Chaput', 'M', ..., "Cycling Men's Sprint", nan, 'France'],
        ...,
        [61965, 'Leopold Kohl', 'M', ...,
         "Cross Country Skiing Men's 18 kilometres", nan, 'Austria'],
        [76445, 'Mulomowandau Erick Mathoho', 'M', ...,
         "Football Men's Football", nan, 'South Africa'],
        [59683, 'David Taro Kikuchi', 'M', ...,
         "Gymnastics Men's Individual All-Around", nan, 'Canada']],
       dtype=object), array([[109485, 'Shek Wai Hung', 'M', ...,
         "Gymnastics Men's Pommelled Horse", nan, 'China'],
        [116150, 'Mlanie Suchet', 'F', ...,
         "Alpine Skiing Women's Downhill", nan, 'France'],
        [92674, 'Susan Jane "Sue" Pedersen (-Pankey)', 'F', ...,
         "Swimming Women's 100 metres Fr