In [None]:
# import library
import pandas as pd
from collections import Counter

# import ML library
from sklearn import svm, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier


# import evaluation metrics
from sklearn.metrics import classification_report, accuracy_score

import warnings
# NOTE(review): with no category/module arguments this installs an "ignore"
# filter that matches every warning, so warnings raised by the libraries in
# later cells are not displayed. Consider narrowing the filter to the specific
# categories that are known to be noise.
warnings.filterwarnings('ignore')

In [None]:
# Read the HR dataset (path is relative to the working directory) and preview
# its first five rows.
hr_csv_path = "HR_dataset.csv"
data = pd.read_csv(hr_csv_path)
data.head(n=5)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,0,49.0,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,60.0,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,50.0,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,50.0,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,73.0,0


In [None]:
# Number of rows per value of the target column `is_promoted`.
promotion_counts = data["is_promoted"].value_counts()
promotion_counts

0    48443
1     4509
Name: is_promoted, dtype: int64

In [None]:
# Keep only the columns used for modelling: the categorical attributes, the
# numeric attributes and the target. Columns not listed here (for example
# employee_id and region) are dropped from `data`.
categorical_columns = ["department", "education", "gender", "recruitment_channel"]
numeric_columns = [
    "no_of_trainings",
    "age",
    "previous_year_rating",
    "length_of_service",
    "awards_won",
    "avg_training_score",
]
data = data[categorical_columns + numeric_columns + ["is_promoted"]]

In [None]:
# One-hot encode the categorical columns (indicator columns are named
# "<column>_<value>"), then discard every row that still contains a missing
# value and preview the result.
# NOTE(review): with the get_dummies defaults a missing categorical value is
# presumably encoded as all-zero indicators rather than NaN, so dropna() would
# only remove rows with missing numeric values — confirm this is intended.
data_encoded = pd.get_dummies(data, prefix_sep="_").dropna()
data_encoded.head()

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,awards_won,avg_training_score,is_promoted,department_Analytics,department_Finance,department_HR,department_Legal,department_Operations,department_Procurement,department_R&D,department_Sales & Marketing,department_Technology,education_Bachelor's,education_Below Secondary,education_Master's & above,gender_f,gender_m,recruitment_channel_other,recruitment_channel_referred,recruitment_channel_sourcing
0,1,35,5.0,8,0,49.0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1
1,1,30,5.0,4,0,60.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,0
2,1,34,3.0,7,0,50.0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1
3,2,39,1.0,10,0,50.0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,0,0
4,1,45,3.0,2,0,73.0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,0


In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE oversampler. sampling_strategy=1 requests a minority/majority ratio of
# 1 after resampling. SMOTE generates synthetic samples randomly, so the seed
# is fixed to make the resampled data reproducible across runs.
sm = SMOTE(sampling_strategy=1, random_state=42)

# Split the encoded frame into the feature matrix and the target vector.
x = data_encoded.drop(columns="is_promoted")
y = data_encoded["is_promoted"]

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# so both partitions keep the same class ratio (the target is heavily
# imbalanced), and seeded so the partitions are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

# Oversample the training partition only; the test partition stays untouched
# so evaluation reflects the real class distribution.
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

# Class counts of the training target before and after resampling.
print(Counter(y_train))
print(Counter(y_train_sm))

Counter({0: 34110, 1: 3231})
Counter({0: 34110, 1: 34110})


In [None]:
# Train the SVM on the SMOTE-balanced training partition (X_train_sm /
# y_train_sm) so the minority class is not swamped by the majority class
# during fitting.
# NOTE(review): the balanced partition has more rows than X_train, so fitting
# is expected to take longer.
clf = svm.SVC(C=1)
clf.fit(X_train_sm, y_train_sm)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
# Evaluation: accuracy on the training and test partitions, followed by a
# per-class precision/recall report for the test partition.
y_predict_train = clf.predict(X_train)
training_acc = accuracy_score(y_train, y_predict_train)

y_predict_test = clf.predict(X_test)
testing_acc = accuracy_score(y_test, y_predict_test)

for template, value in (
    ("Training Accuracy: {}", training_acc),
    ("Testing Accuracy: {}", testing_acc),
):
    print(template.format(value))

print(classification_report(y_test, y_predict_test))

Training Accuracy: 0.913473126054471
Testing Accuracy: 0.9145244215938303
              precision    recall  f1-score   support

           0       0.91      1.00      0.96      8538
           1       0.00      0.00      0.00       798

    accuracy                           0.91      9336
   macro avg       0.46      0.50      0.48      9336
weighted avg       0.84      0.91      0.87      9336



In [None]:
# Read the FIFA player dataset (path is relative to the working directory)
# and preview its first five rows.
fifa_csv_path = "fifa.csv"
data_2 = pd.read_csv(fifa_csv_path)
data_2.head(n=5)

Unnamed: 0.1,Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,Value,Wage,Special,Preferred Foot,International Reputation,Weak Foot,Skill Moves,Work Rate,Body Type,Real Face,Position,Jersey Number,Joined,Loaned From,Contract Valid Until,Height,Weight,LS,ST,RS,LW,LF,CF,RF,RW,LAM,CAM,RAM,LM,...,LB,LCB,CB,RCB,RB,Crossing,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,https://cdn.sofifa.org/teams/2/light/241.png,€110.5M,€565K,2202,Left,5.0,4.0,4.0,Medium/ Medium,Messi,Yes,RF,10.0,"Jul 1, 2004",,2021,5'7,159lbs,88+2,88+2,88+2,92+2,93+2,93+2,93+2,92+2,93+2,93+2,93+2,91+2,...,59+2,47+2,47+2,47+2,59+2,84.0,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,https://cdn.sofifa.org/teams/2/light/45.png,€77M,€405K,2228,Right,5.0,4.0,5.0,High/ Low,C. Ronaldo,Yes,ST,7.0,"Jul 10, 2018",,2022,6'2,183lbs,91+3,91+3,91+3,89+3,90+3,90+3,90+3,89+3,88+3,88+3,88+3,88+3,...,61+3,53+3,53+3,53+3,61+3,84.0,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,https://cdn.sofifa.org/teams/2/light/73.png,€118.5M,€290K,2143,Right,5.0,5.0,5.0,High/ Medium,Neymar,Yes,LW,10.0,"Aug 3, 2017",,2022,5'9,150lbs,84+3,84+3,84+3,89+3,89+3,89+3,89+3,89+3,89+3,89+3,89+3,88+3,...,60+3,47+3,47+3,47+3,60+3,79.0,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,https://cdn.sofifa.org/teams/2/light/11.png,€72M,€260K,1471,Right,4.0,3.0,1.0,Medium/ Medium,Lean,Yes,GK,1.0,"Jul 1, 2011",,2020,6'4,168lbs,,,,,,,,,,,,,...,,,,,,17.0,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,43.0,64.0,12.0,38.0,30.0,12.0,68.0,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,https://cdn.sofifa.org/teams/2/light/10.png,€102M,€355K,2281,Right,4.0,5.0,4.0,High/ High,Normal,Yes,RCM,7.0,"Aug 30, 2015",,2023,5'11,154lbs,82+3,82+3,82+3,87+3,87+3,87+3,87+3,87+3,88+3,88+3,88+3,88+3,...,73+3,66+3,66+3,66+3,73+3,93.0,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,61.0,87.0,94.0,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M


In [None]:
# Keep the target column (`Position`) plus the per-player skill ratings that
# serve as features.
# NOTE(review): the raw file also contains a 'Crossing' rating that is not
# selected here — confirm whether that omission is intentional.
outfield_skills = [
    'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
    'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
    'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
    'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
    'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
    'Marking', 'StandingTackle', 'SlidingTackle',
]
goalkeeper_skills = ['GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes']
data_2 = data_2[["Position"] + outfield_skills + goalkeeper_skills]

In [None]:
# Preview the first five rows of the reduced frame.
data_2.head(n=5)

Unnamed: 0,Position,Finishing,HeadingAccuracy,ShortPassing,Volleys,Dribbling,Curve,FKAccuracy,LongPassing,BallControl,Acceleration,SprintSpeed,Agility,Reactions,Balance,ShotPower,Jumping,Stamina,Strength,LongShots,Aggression,Interceptions,Positioning,Vision,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes
0,RF,95.0,70.0,90.0,86.0,97.0,93.0,94.0,87.0,96.0,91.0,86.0,91.0,95.0,95.0,85.0,68.0,72.0,59.0,94.0,48.0,22.0,94.0,94.0,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0
1,ST,94.0,89.0,81.0,87.0,88.0,81.0,76.0,77.0,94.0,89.0,91.0,87.0,96.0,70.0,95.0,95.0,88.0,79.0,93.0,63.0,29.0,95.0,82.0,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0
2,LW,87.0,62.0,84.0,84.0,96.0,88.0,87.0,78.0,95.0,94.0,90.0,96.0,94.0,84.0,80.0,61.0,81.0,49.0,82.0,56.0,36.0,89.0,87.0,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0
3,GK,13.0,21.0,50.0,13.0,18.0,21.0,19.0,51.0,42.0,57.0,58.0,60.0,90.0,43.0,31.0,67.0,43.0,64.0,12.0,38.0,30.0,12.0,68.0,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0
4,RCM,82.0,55.0,92.0,82.0,86.0,85.0,83.0,91.0,91.0,78.0,76.0,79.0,91.0,77.0,91.0,63.0,90.0,75.0,91.0,76.0,61.0,87.0,94.0,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0


In [None]:
# Group the detailed position codes into broad roles.
forward_player = ["ST", "LW", "RW", "LF", "RF", "RS", "LS", "CF"]
midfielder_player = [
    "CM", "RCM", "LCM",
    "CDM", "RDM", "LDM",
    "CAM", "LAM", "RAM",
    "RM", "LM",
]
defender_player = ["CB", "RCB", "LCB", "LWB", "RWB", "LB", "RB"]

# Replace each position code with an integer role label:
# 0 = goalkeeper, 1 = defender, 2 = midfielder, 3 = forward.
is_goalkeeper = data_2["Position"] == "GK"
data_2.loc[is_goalkeeper, "Position"] = 0

role_labels = {1: defender_player, 2: midfielder_player, 3: forward_player}
for label, codes in role_labels.items():
    data_2.loc[data_2["Position"].isin(codes), "Position"] = label

# Drop rows containing any missing value, then count the players per role.
data_2 = data_2.dropna()
data_2['Position'].value_counts()


2    6838
1    5866
3    3418
0    2025
Name: Position, dtype: int64

In [None]:
# Cast the role label column to a 64-bit integer, then list every column's
# dtype to confirm the frame is fully numeric.
data_2 = data_2.astype({"Position": "int64"})
data_2.dtypes

Position             int64
Finishing          float64
HeadingAccuracy    float64
ShortPassing       float64
Volleys            float64
Dribbling          float64
Curve              float64
FKAccuracy         float64
LongPassing        float64
BallControl        float64
Acceleration       float64
SprintSpeed        float64
Agility            float64
Reactions          float64
Balance            float64
ShotPower          float64
Jumping            float64
Stamina            float64
Strength           float64
LongShots          float64
Aggression         float64
Interceptions      float64
Positioning        float64
Vision             float64
Penalties          float64
Composure          float64
Marking            float64
StandingTackle     float64
SlidingTackle      float64
GKDiving           float64
GKHandling         float64
GKKicking          float64
GKPositioning      float64
GKReflexes         float64
dtype: object

In [None]:
# Feature matrix (all skill ratings) and target vector (role label).
X = data_2.drop(columns='Position')
y = data_2['Position']

# Hold out 20% of the players for testing. The split is stratified on the
# role label so each role keeps its share in both partitions, and seeded so
# the partitions (and every score computed from them) are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Sweep the gradient-boosting learning rate with a fixed number of estimators
# and print the test-partition accuracy for each setting.
# NOTE(review): the rates are compared on the test partition itself, so the
# best score printed here is an optimistic estimate — consider a validation
# split or cross-validation for the selection.
lr_list = [0.05, 0.1, 0.15, 0.2, 0.25, 0.5, 0.75, 1]

for lr in lr_list:
    clf = GradientBoostingClassifier(learning_rate=lr, n_estimators=50)
    clf.fit(X_train, y_train)
    test_accuracy = clf.score(X_test, y_test)

    print("Learning rate:", lr)
    # The extra newline reproduces the blank separator line between settings.
    print("Test Accuracy:", test_accuracy, end="\n\n")

Learning rate: 0.05
Test Accuracy: 0.8652892561983471

Learning rate: 0.1
Test Accuracy: 0.8757575757575757

Learning rate: 0.15
Test Accuracy: 0.878236914600551

Learning rate: 0.2
Test Accuracy: 0.874931129476584

Learning rate: 0.25
Test Accuracy: 0.8785123966942149

Learning rate: 0.5
Test Accuracy: 0.8735537190082645

Learning rate: 0.75
Test Accuracy: 0.8647382920110193

Learning rate: 1
Test Accuracy: 0.8578512396694215



In [None]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.svm import LinearSVC

In [None]:
# One-vs-one multiclass wrapper around a linear SVM: fit on the training
# partition, then report per-class metrics and accuracy on the test partition.
clf = OneVsOneClassifier(LinearSVC())
clf.fit(X_train, y_train)

prediction = clf.predict(X_test)
test_accuracy = clf.score(X_test, y_test)

print("Classification Report")
print(classification_report(y_test, prediction))
print("Accuracy: ", test_accuracy)

Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       423
           1       0.96      0.79      0.86      1175
           2       0.74      0.92      0.82      1352
           3       0.89      0.71      0.79       680

    accuracy                           0.85      3630
   macro avg       0.89      0.86      0.87      3630
weighted avg       0.87      0.85      0.85      3630

Accuracy:  0.8487603305785124


In [None]:
from sklearn.linear_model import LogisticRegression

# One-vs-one multiclass wrapper around logistic regression: fit on the
# training partition, then report per-class metrics and accuracy on the test
# partition.
clf = OneVsOneClassifier(LogisticRegression())
clf.fit(X_train, y_train)

prediction = clf.predict(X_test)
test_accuracy = clf.score(X_test, y_test)

print("Classification Report")
print(classification_report(y_test, prediction))
print("Accuracy: ", test_accuracy)

Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       423
           1       0.91      0.92      0.91      1175
           2       0.83      0.85      0.84      1352
           3       0.84      0.80      0.82       680

    accuracy                           0.88      3630
   macro avg       0.90      0.89      0.89      3630
weighted avg       0.88      0.88      0.88      3630

Accuracy:  0.8798898071625344


In [None]:
# Logistic regression without an explicit multiclass wrapper, for comparison
# with the wrapped variants: fit, predict, and report on the test partition.
clf_2 = LogisticRegression()
clf_2.fit(X_train, y_train)

prediction = clf_2.predict(X_test)
test_accuracy = clf_2.score(X_test, y_test)

print("Classification Report")
print(classification_report(y_test, prediction))
print("Accuracy: ", test_accuracy)

Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       423
           1       0.91      0.91      0.91      1175
           2       0.83      0.85      0.84      1352
           3       0.84      0.79      0.82       680

    accuracy                           0.88      3630
   macro avg       0.89      0.89      0.89      3630
weighted avg       0.88      0.88      0.88      3630

Accuracy:  0.8771349862258954


In [None]:
# One-vs-rest multiclass wrapper around a linear SVM: fit on the training
# partition, then report per-class metrics and accuracy on the test partition.
clf = OneVsRestClassifier(LinearSVC())
clf.fit(X_train, y_train)

prediction = clf.predict(X_test)
test_accuracy = clf.score(X_test, y_test)

print("Classification Report")
print(classification_report(y_test, prediction))
print("Accuracy: ", test_accuracy)

Classification Report
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       423
           1       0.72      0.99      0.83      1175
           2       0.88      0.57      0.69      1352
           3       0.78      0.82      0.80       680

    accuracy                           0.80      3630
   macro avg       0.84      0.84      0.83      3630
weighted avg       0.82      0.80      0.79      3630

Accuracy:  0.8019283746556474


In [None]:
from sklearn.linear_model import LogisticRegression

# One-vs-rest multiclass wrapper around logistic regression: fit on the
# training partition, then report per-class metrics and accuracy on the test
# partition.
clf = OneVsRestClassifier(LogisticRegression())
clf.fit(X_train, y_train)

prediction = clf.predict(X_test)
test_accuracy = clf.score(X_test, y_test)

print("Classification Report")
print(classification_report(y_test, prediction))
print("Accuracy: ", test_accuracy)

Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       423
           1       0.91      0.91      0.91      1175
           2       0.82      0.84      0.83      1352
           3       0.83      0.78      0.81       680

    accuracy                           0.87      3630
   macro avg       0.89      0.88      0.89      3630
weighted avg       0.87      0.87      0.87      3630

Accuracy:  0.8721763085399449
