In [1]:
!pip install scikit-learn



In [3]:
import pandas as pd
import numpy as np 
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib
# Select Matplotlib's TkAgg backend for the figures drawn in this notebook.
# NOTE(review): TkAgg is a GUI backend, so figures presumably open in
# separate windows instead of rendering inline — confirm this is intended.
matplotlib.use('TkAgg')  

In [5]:
# Load the Forbes highest-paid-athletes dataset.
# The original absolute path only exists on the author's machine, so it is
# kept as the default but can be overridden by setting the
# FORBES_ATHLETES_CSV environment variable before starting the kernel.
import os

DATA_PATH = os.environ.get(
    "FORBES_ATHLETES_CSV",
    r"C:\Users\Vigan\Downloads\forbesathletesv2.csv",
)
df = pd.read_csv(DATA_PATH)

In [7]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,Name,Earnings,Year,Sport
0,Conor McGregor,180.0,2021,Mixed Martial Arts
1,Lionel Messi,130.0,2021,Soccer
2,Cristiano Ronaldo,120.0,2021,Soccer
3,Dak Prescott,107.5,2021,Football
4,LeBron James,96.5,2021,Basketball


In [8]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

(1659, 4)

In [9]:
# Report the dtype of every column, under a heading line.
print("Data types of columns:", df.dtypes, sep="\n")

Data types of columns:
Name         object
Earnings    float64
Year          int64
Sport        object
dtype: object


In [10]:
# Collect every row that is an exact repeat of an earlier row and report
# whether any were found.
duplicate_rows = df.loc[df.duplicated()]
if duplicate_rows.empty:
    print("No duplicate rows found.")
else:
    print("Duplicate rows found:")
    print(duplicate_rows)

No duplicate rows found.


In [11]:
# Count the missing values in each column.
null_values = df.isna().sum()
print(null_values)

Name        0
Earnings    0
Year        0
Sport       0
dtype: int64


In [22]:
# Bar chart of the ten athletes with the largest summed earnings.
plt.figure(figsize=(25, 10))

# Enlarge the default font and both sets of tick labels.
plt.rc('font', size=15)
for tick_axis in ('xtick', 'ytick'):
    plt.rc(tick_axis, labelsize=15)

# Sum earnings per athlete over all rows, then keep the ten largest totals.
earnings_by_athlete = df.groupby('Name')['Earnings'].sum()
athlete_salary = earnings_by_athlete.sort_values(ascending=False).head(10)
names = list(athlete_salary.index)

# The totals are multiplied by one million before being plotted.
scaled_totals = athlete_salary * 10 ** 6
sns.barplot(x=names, y=scaled_totals)
plt.title('All Time Top 10 Athletes Salary Rank')
plt.show()

In [56]:
# Twenty largest (athlete, year) earnings totals in the dataset.
athlete_years = (
    df.groupby(['Name', 'Year'])['Earnings']
    .sum()
    .sort_values(ascending=False)
    .head(20)
)

# The index holds (Name, Year) pairs; split it into two parallel lists.
year = [season for _, season in athlete_years.index]
name = [athlete for athlete, _ in athlete_years.index]

# Unique athletes and years appearing in that ranking. They are derived from
# the computed `name` / `year` lists rather than from hand-copied literals,
# so the filters below cannot drift out of sync with the data.
names_df = pd.DataFrame({'names': name})
unique_names = names_df['names'].unique()
list_of_years = pd.DataFrame(year)
unique_years = list_of_years[0].unique()

# Keep only rows whose athlete AND year both occur in the top-20 ranking.
filter1 = df['Name'].isin(unique_names)
filter2 = df['Year'].isin(unique_years)

# Display the filtered rows.
df[filter1 & filter2]

Unnamed: 0,Name,Earnings,Year,Sport,sport_encoded
0,Conor McGregor,180.0,2021,Mixed Martial Arts,10
1,Lionel Messi,130.0,2021,Soccer,12
2,Cristiano Ronaldo,120.0,2021,Soccer,12
3,Dak Prescott,107.5,2021,Football,5
5,Neymar,95.0,2021,Soccer,12
6,Roger Federer,90.0,2021,Tennis,13
12,Tiger Woods,60.0,2021,Golf,7
50,Roger Federer,106.3,2020,Tennis,13
51,Cristiano Ronaldo,105.0,2020,Soccer,12
52,Lionel Messi,104.0,2020,Soccer,12


In [28]:
# Legend labels for the yearly earnings chart, one "<year> <athlete>" entry
# per year (presumably the top earner of that year — verify against the data).
# "Conor McGregor" is spelled the way it appears in the dataset's Name column.
top_10_by_year = [
    '2021 Conor McGregor',
    '2020 Roger Federer',
    '2019 Lionel Messi',
    '2018 Floyd Mayweather',
    '2015 Floyd Mayweather',
    '2014 Floyd Mayweather',
    '2010 Tiger Woods',
    '2009 Tiger Woods',
    '2008 Tiger Woods',
    '2007 Tiger Woods',
]

In [57]:
# Bar chart of the top (athlete, year) earnings, grouped by year.
plt.figure(figsize=(25, 10))

# NOTE(review): `year` can contain the same year more than once, in which
# case seaborn draws one aggregated bar per year — confirm that is intended.
ax = sns.barplot(x=year, y=athlete_years)
ax.legend(top_10_by_year)
ax.set_title('Top Athletes Earning from year 2007 - 2021')
ax.set_ylabel('Earnings per million of dollars')
ax.set_xlabel('Years')
plt.show()

In [58]:
# Bar chart of the total earnings recorded for each sport, largest first.
plt.figure(figsize=(30, 10))

sports = df.groupby('Sport')['Earnings'].sum().sort_values(ascending=False)
sport_kind = list(sports.index)
sns.barplot(x=sport_kind, y=sports)

# Title and axis labels so the figure can be read on its own, like the other
# charts in this notebook.
plt.title('Total Earnings by Sport')
plt.xlabel('Sport')
plt.ylabel('Total earnings (millions of dollars)')
plt.show()

In [60]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

print(df.head())

# Cluster on the two numeric columns: earnings and year.
features = ['Earnings', 'Year']
X = df[features]

# Standardise the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project onto two principal components. The input already has two
# features, so no dimension is dropped here.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Elbow method: record the within-cluster sum of squares (inertia) for
# 1 through 10 clusters.
candidate_counts = range(1, 11)
wcss = [
    KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=42)
    .fit(X_pca)
    .inertia_
    for k in candidate_counts
]

# Elbow curve.
plt.plot(candidate_counts, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

# Number of clusters chosen after inspecting the elbow curve.
n_clusters = 3

# Final clustering with the chosen number of clusters.
kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=42)
kmeans.fit(X_pca)

# Scatter the projected points coloured by cluster label, with the cluster
# centres overlaid as red crosses.
centroids = kmeans.cluster_centers_
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans.labels_, cmap='viridis')
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='red', marker='x', label='Centroids')
plt.title('KMeans Clustering of Forbes Highest Paid Athletes')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()

                Name  Earnings  Year               Sport  sport_encoded
0     Conor McGregor     180.0  2021  Mixed Martial Arts             10
1       Lionel Messi     130.0  2021              Soccer             12
2  Cristiano Ronaldo     120.0  2021              Soccer             12
3       Dak Prescott     107.5  2021            Football              5
4       LeBron James      96.5  2021          Basketball              1
