In [167]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [168]:
# load all the requred datasets
team_appearances = pd.read_csv('World_cup_teams.csv')
results = pd.read_csv('world cup results.csv')
rankings = pd.read_csv('rankings.csv')
fixtures_first = pd.read_excel('World_cup_fixtures.xlsx', sheet_name='Sheet2')
fixtures_second = pd.read_excel('World_cup_fixtures.xlsx', sheet_name='Sheet1')

In [169]:
# first stage is a group stage involving 8 teams, 4 teams will progress to the next stage
# the fixtures of this is given by fixtures_first
worldcup_teams_8 = ['Sri Lanka', 'Bangladesh','Netherlands','Ireland','Sctoland','Papua New Guinea','Namibia','Oman']
df_teams_1 = results[results['Team 1'].isin(worldcup_teams_8)]
df_teams_2 = results[results['Team 2'].isin(worldcup_teams_8)]
df_teams = pd.concat((df_teams_1, df_teams_2))
df_teams.drop_duplicates()
df_teams.count()

Team 1         84
Team 2         84
Result(Won)    84
dtype: int64

In [170]:
#convert team-1 and team-2 from categorical variables to continous inputs 
# Get dummy variables
final = pd.get_dummies(df_teams, prefix=['Team 1', 'Team 2'], columns=['Team 1', 'Team 2'])

# Separate X and y sets
X = final.drop(['Result(Won)'], axis=1)
y = final["Result(Won)"]


# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [171]:
final.head()

Unnamed: 0,Result(Won),Team 1_Afghanistan,Team 1_Australia,Team 1_Bangladesh,Team 1_Bangladesh.1,Team 1_England,Team 1_England.1,Team 1_India,Team 1_Ireland,Team 1_Netherlands,...,Team 2_Nepal,Team 2_Netherlands,Team 2_New Zealand,Team 2_Oman,Team 2_Pakistan,Team 2_South Africa,Team 2_Sri Lanka,Team 2_United Arab Emirates,Team 2_West Indies,Team 2_Zimbabwe
22,Pakistan,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
30,India,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,Ireland,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35,Pakistan,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
37,Sri Lanka,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [172]:
rf = RandomForestClassifier(n_estimators=100, max_depth=20,
                              random_state=0)
rf.fit(X_train, y_train) 


score = rf.score(X_train, y_train)
score2 = rf.score(X_test, y_test)


print("Training set accuracy: ", '%.3f'%(score))
print("Test set accuracy: ", '%.3f'%(score2))

Training set accuracy:  0.966
Test set accuracy:  0.577


In [173]:
fixtures_1 = fixtures_first
fixtures_1.tail()
pred_set = []

In [174]:
# Loop to add teams to new prediction dataset based on the ranking position of each team
for index, row in fixtures_1.iterrows():
    if row['Team1 ranking'] < row['Team2 ranking']:
        pred_set.append({'Team 1': row['Team 1'], 'Team 2': row['Team 2'], 'winning_team': None})
    else:
        pred_set.append({'Team 1': row['Team 2'], 'Team 2': row['Team 1'], 'winning_team': None})
        
pred_set = pd.DataFrame(pred_set)
backup_pred_set = pred_set
pred_set.head()

Unnamed: 0,Team 1,Team 2,winning_team
0,Sri Lanka,Ireland,
1,Papua New Guinea,Oman,
2,Bangladesh,Namibia,
3,Scotland,Netherlands,
4,Ireland,Oman,


In [175]:
# Get dummy variables and drop winning_team column
pred_set = pd.get_dummies(pred_set, prefix=['Team 1', 'Team 2'], columns=['Team 1', 'Team 2'])

# Add missing columns compared to the model's training dataset
missing_cols = set(final.columns) - set(pred_set.columns)
for c in missing_cols:
    pred_set[c] = 0
pred_set = pred_set[final.columns]


pred_set = pred_set.drop(['Result(Won)'], axis=1)
pred_set.head()

Unnamed: 0,Team 1_Afghanistan,Team 1_Australia,Team 1_Bangladesh,Team 1_Bangladesh.1,Team 1_England,Team 1_England.1,Team 1_India,Team 1_Ireland,Team 1_Netherlands,Team 1_New Zealand,...,Team 2_Nepal,Team 2_Netherlands,Team 2_New Zealand,Team 2_Oman,Team 2_Pakistan,Team 2_South Africa,Team 2_Sri Lanka,Team 2_United Arab Emirates,Team 2_West Indies,Team 2_Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [176]:
standings=[]
predictions = rf.predict(pred_set)
for i in range(fixtures_first.shape[0]):
    print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
    if predictions[i] == 1:
        a = ("Winner: " + backup_pred_set.iloc[i, 1])
    
    else:
        a = ("Winner: " + backup_pred_set.iloc[i, 0])
    print(a)  
    standings.append(a)
    print("")

 Ireland and Sri Lanka
Winner: Sri Lanka

Oman and Papua New Guinea  
Winner: Papua New Guinea  

Namibia and Bangladesh 
Winner: Bangladesh 

Netherlands  and  Scotland
Winner:  Scotland

Oman and Ireland 
Winner: Ireland 

Papua New Guinea and Sri Lanka 
Winner: Sri Lanka 

Namibia  and Scotland
Winner: Scotland

Netherlands and Bangladesh  
Winner: Bangladesh  

Papua New Guinea  and Ireland
Winner: Ireland

Oman and Sri Lanka 
Winner: Sri Lanka 

Namibia and Netherlands 
Winner: Netherlands 

Scotland and Bangladesh 
Winner: Bangladesh 



In [177]:
backup_pred_set.insert(3,'Results',standings)
del backup_pred_set['winning_team']
print(backup_pred_set)

                Team 1             Team 2                     Results
0            Sri Lanka            Ireland           Winner: Sri Lanka
1   Papua New Guinea                 Oman  Winner: Papua New Guinea  
2          Bangladesh             Namibia         Winner: Bangladesh 
3             Scotland       Netherlands            Winner:  Scotland
4             Ireland                Oman            Winner: Ireland 
5           Sri Lanka    Papua New Guinea          Winner: Sri Lanka 
6             Scotland           Namibia             Winner: Scotland
7         Bangladesh          Netherlands        Winner: Bangladesh  
8              Ireland  Papua New Guinea              Winner: Ireland
9           Sri Lanka                Oman          Winner: Sri Lanka 
10        Netherlands             Namibia        Winner: Netherlands 
11         Bangladesh            Scotland         Winner: Bangladesh 


In [178]:
#from the above predictions, it is evident that A1 (Winner of group A) is Sri lanka and A2(runner up of group A) is Ireland with 3 and 2 wins respectively
#similarly B1(Winner of group 1) is Bangladesh and B2(Runner up of group 2) is Scotland with 3 and 2 wins respectfully.
# in the second group stage fixtures, these teams to be added, so we replace them as below

fixtures_second['Team 1'] = fixtures_second['Team 1'].replace('A1', 'Sri Lanka')
fixtures_second['Team 2'] = fixtures_second['Team 2'].replace('A1', 'Sri Lanka')
fixtures_second['Team 1'] = fixtures_second['Team 1'].replace('A2', 'Ireland')
fixtures_second['Team 2'] = fixtures_second['Team 2'].replace('A2', 'Ireland')
fixtures_second['Team 1'] = fixtures_second['Team 1'].replace('B1', 'Bangladesh')
fixtures_second['Team 2'] = fixtures_second['Team 2'].replace('B1', 'Bangladesh')
fixtures_second['Team 1'] = fixtures_second['Team 1'].replace('B2', 'Scotland')
fixtures_second['Team 2'] = fixtures_second['Team 2'].replace('B2', 'Scotland')
    


In [14]:
print(fixtures_second)

# to add the rankings of team 1 and team 2 respectively
fixtures_second.to_excel('new_fixtures_1.xlsx')

           Date        Team 1        Team 2  Group
0   Oct 24, Sat    Australia       Pakistan    1.0
1   Oct 24, Sat        India   South Africa    2.0
2   Oct 25, Sun     Sri Lanka      Scotland    1.0
3   Oct 25, Sun  New Zealand    West Indies    1.0
4   Oct 26, Mon   Afghanistan       Ireland    2.0
5   Oct 26, Mon      England     Bangladesh    2.0
6   Oct 27, Tue    New Zeland      Scotland    1.0
7   Oct 28, Wed   Afghanistan    Bangladesh    2.0
8   Oct 28, Wed    Australia    West Indies    1.0
9   Oct 29, Thu      Pakistan     Sri Lanka    1.0
10  Oct 29, Thu         India       Ireland    2.0
11  Oct 30, Fri      England   South Africa    2.0
12  Oct 30, Fri   West Indies      Scotland    1.0
13  Oct 31, Sat  New Zealand       Pakistan    1.0
14  Oct 31, Sat    Australia      Sri Lanka    1.0
15   Nov 01,Sun   Afghanistan  South Africa    2.0
16   Nov 01,Sun         India       England    2.0
17  Nov 02, Mon       Ireland    Bangladesh    2.0
18  Nov 02, Mon    New Zeland  

In [179]:
#rankings of both the teams are added and is called here
fixtures = pd.read_excel('new_fixtures_1.xlsx')

In [180]:
#Now in this group stage, we have 12 teams deivided across 2 teams
worldcup_teams_12 = ['Sri Lanka', 'Bangladesh','Ireland','Sctoland','India','West Indies','Australia', 'England','South Africa','Pakistan','Afghanistan','New Zeland']
df_teams_1 = results[results['Team 1'].isin(worldcup_teams_12)]
df_teams_2 = results[results['Team 2'].isin(worldcup_teams_12)]
df_teams = pd.concat((df_teams_1, df_teams_2))
df_teams.drop_duplicates()
df_teams.count()

Team 1         239
Team 2         239
Result(Won)    239
dtype: int64

In [181]:
#convert team-1 and team-2 from categorical variables to continous inputs 
# Get dummy variables
final_1 = pd.get_dummies(df_teams, prefix=['Team 1', 'Team 2'], columns=['Team 1', 'Team 2'])
print(final_1)
# Separate X and y sets
X = final_1.drop(['Result(Won)'], axis=1)
y = final_1["Result(Won)"]
final_1.head()

# Separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

       Result(Won)  Team 1_Afghanistan  Team 1_Afghanistan   Team 1_Australia  \
6        No result                   0                    0                 0   
22       Pakistan                    0                    0                 0   
27    Netherlands                    0                    0                 0   
29    West Indies                    0                    0                 1   
30          India                    0                    0                 0   
32        England                    0                    0                 0   
33        Ireland                    0                    0                 0   
34      Sri Lanka                    0                    0                 1   
37      Sri Lanka                    0                    0                 0   
38          India                    0                    0                 0   
40   South Africa                    0                    0                 0   
43   South Africa           

In [182]:
rf1 = RandomForestClassifier(n_estimators=100, max_depth=20,
                              random_state=0)
rf1.fit(X_train, y_train) 


score = rf1.score(X_train, y_train)
score2 = rf1.score(X_test, y_test)


print("Training set accuracy: ", '%.3f'%(score))
print("Test set accuracy: ", '%.3f'%(score2))

Training set accuracy:  0.844
Test set accuracy:  0.722


In [183]:
# We only need the group stage games, so we have to slice the dataset (this contains both the group 1 and group 2 matches)

fixtures_2 = fixtures.iloc[:30,:]
print(fixtures_2)


    Unnamed: 0         Date        Team 1        Team 2  Group  \
0            0  Oct 24, Sat    Australia       Pakistan    1.0   
1            1  Oct 24, Sat        India   South Africa    2.0   
2            2  Oct 25, Sun     Sri Lanka      Scotland    1.0   
3            3  Oct 25, Sun  New Zealand    West Indies    1.0   
4            4  Oct 26, Mon   Afghanistan       Ireland    2.0   
5            5  Oct 26, Mon      England     Bangladesh    2.0   
6            6  Oct 27, Tue    New Zeland      Scotland    1.0   
7            7  Oct 28, Wed   Afghanistan    Bangladesh    2.0   
8            8  Oct 28, Wed    Australia    West Indies    1.0   
9            9  Oct 29, Thu      Pakistan     Sri Lanka    1.0   
10          10  Oct 29, Thu         India       Ireland    2.0   
11          11  Oct 30, Fri      England   South Africa    2.0   
12          12  Oct 30, Fri   West Indies      Scotland    1.0   
13          13  Oct 31, Sat  New Zealand       Pakistan    1.0   
14        

In [184]:
# Loop to add teams to new prediction dataset based on the ranking position of each team
pred_set_1 = []
for index, row in fixtures_2.iterrows():
    if row['Team 1 ranking'] < row['Team 2 Ranking']:
        pred_set_1.append({'Team 1': row['Team 1'], 'Team 2': row['Team 2'], 'winning_team': None})
    else:
        pred_set_1.append({'Team 1': row['Team 2'], 'Team 2': row['Team 1'], 'winning_team': None})
        
pred_set_1 = pd.DataFrame(pred_set_1)
backup_pred_set_1 = pred_set_1
print(backup_pred_set_1)

          Team 1        Team 2 winning_team
0     Australia       Pakistan         None
1         India   South Africa         None
2      Sri Lanka      Scotland         None
3   New Zealand    West Indies         None
4    Afghanistan       Ireland         None
5       England     Bangladesh         None
6     New Zeland      Scotland         None
7     Bangladesh   Afghanistan         None
8     Australia    West Indies         None
9       Pakistan     Sri Lanka         None
10         India       Ireland         None
11      England   South Africa         None
12   West Indies      Scotland         None
13      Pakistan  New Zealand          None
14    Australia      Sri Lanka         None
15  South Africa   Afghanistan         None
16       England         India         None
17    Bangladesh       Ireland         None
18    New Zeland     Sri Lanka         None
19      Pakistan   West Indies         None
20    Australia       Scotland         None
21       England   Afghanistan  

In [185]:
# Get dummy variables and drop winning_team column
pred_set_1 = pd.get_dummies(pred_set_1, prefix=['Team 1', 'Team 2'], columns=['Team 1', 'Team 2'])

# Add missing columns compared to the model's training dataset
missing_cols = set(final_1.columns) - set(pred_set_1.columns)
for c in missing_cols:
    pred_set_1[c] = 0
pred_set_1 = pred_set_1[final_1.columns]


pred_set_1 = pred_set_1.drop(['Result(Won)'], axis=1)
print(pred_set_1.head(5))

   Team 1_Afghanistan  Team 1_Afghanistan   Team 1_Australia  \
0                   0                    0                 0   
1                   0                    0                 0   
2                   0                    0                 0   
3                   0                    0                 0   
4                   1                    0                 0   

   Team 1_Bangladesh  Team 1_Bangladesh   Team 1_England  Team 1_England   \
0                  0                   0               0                0   
1                  0                   0               0                0   
2                  0                   0               0                0   
3                  0                   0               0                0   
4                  0                   0               0                0   

   Team 1_India  Team 1_India   Team 1_India v  ...  Team 2_Netherlands  \
0             0              0               0  ...                   0   
1 

In [186]:
standings_1=[]
predictions_1 = rf1.predict(pred_set_1)
for j in range(fixtures_2.shape[0]):
    print(backup_pred_set_1.iloc[j, 1] + " and " + backup_pred_set_1.iloc[j, 0])
    if predictions_1[i] == 1:
        b = ("Winner: " + backup_pred_set_1.iloc[j, 1])
    else:
        b = ("Winner: " + backup_pred_set_1.iloc[j, 0])
    print(b)  
    standings_1.append(b)
    print("")

Pakistan and Australia 
Winner: Australia 

South Africa and India 
Winner: India 

Scotland and Sri Lanka
Winner: Sri Lanka

West Indies and New Zealand 
Winner: New Zealand 

Ireland and Afghanistan
Winner: Afghanistan

Bangladesh and England 
Winner: England 

Scotland and New Zeland
Winner: New Zeland

Afghanistan and Bangladesh
Winner: Bangladesh

West Indies and Australia 
Winner: Australia 

Sri Lanka and Pakistan
Winner: Pakistan

Ireland and India
Winner: India

South Africa and England 
Winner: England 

Scotland and West Indies
Winner: West Indies

New Zealand  and  Pakistan
Winner:  Pakistan

Sri Lanka and Australia 
Winner: Australia 

Afghanistan and South Africa
Winner: South Africa

India and England
Winner: England

Ireland and Bangladesh
Winner: Bangladesh

Sri Lanka and New Zeland
Winner: New Zeland

West Indies and Pakistan
Winner: Pakistan

Scotland and Australia 
Winner: Australia 

Afghanistan and England
Winner: England

Ireland and South Africa
Winner: South Af

In [187]:
backup_pred_set_1.insert(3,'Results',standings_1)
del backup_pred_set_1['winning_team']
print(backup_pred_set_1)


          Team 1        Team 2               Results
0     Australia       Pakistan    Winner: Australia 
1         India   South Africa        Winner: India 
2      Sri Lanka      Scotland     Winner: Sri Lanka
3   New Zealand    West Indies  Winner: New Zealand 
4    Afghanistan       Ireland   Winner: Afghanistan
5       England     Bangladesh      Winner: England 
6     New Zeland      Scotland    Winner: New Zeland
7     Bangladesh   Afghanistan    Winner: Bangladesh
8     Australia    West Indies    Winner: Australia 
9       Pakistan     Sri Lanka      Winner: Pakistan
10         India       Ireland         Winner: India
11      England   South Africa      Winner: England 
12   West Indies      Scotland   Winner: West Indies
13      Pakistan  New Zealand      Winner:  Pakistan
14    Australia      Sri Lanka    Winner: Australia 
15  South Africa   Afghanistan  Winner: South Africa
16       England         India       Winner: England
17    Bangladesh       Ireland    Winner: Bang

In [188]:
def clean_and_predict(matches, rankings, final, logreg):

    # Initialization of auxiliary list for data cleaning
    positions = []

    # Loop to retrieve each team's position according to ICC ranking
    for match in matches:
        positions.append(rankings.loc[rankings['Teams'] == match[0],'Ranking'].iloc[0])
        positions.append(rankings.loc[rankings['Teams'] == match[1],'Ranking'].iloc[0])
    
    # Creating the DataFrame for prediction
    pred_set = []

    # Initializing iterators for while loop
    i = 0
    j = 0

    # 'i' will be the iterator for the 'positions' list, and 'j' for the list of matches (list of tuples)
    while i < len(positions):
        dict1 = {}

        # If position of first team is better then this team will be the 'Team_1' team, and vice-versa
        if positions[i] < positions[i + 1]:
            dict1.update({'Team_1': matches[j][0], 'Team_2': matches[j][1]})
        else:
            dict1.update({'Team_1': matches[j][1], 'Team_2': matches[j][0]})

        # Append updated dictionary to the list, that will later be converted into a DataFrame
        pred_set.append(dict1)
        i += 2
        j += 1
        
        # Convert list into DataFrame
    pred_set = pd.DataFrame(pred_set)
    backup_pred_set = pred_set

    # Get dummy variables and drop winning_team column
    pred_set = pd.get_dummies(pred_set, prefix=['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'])

    # Add missing columns compared to the model's training dataset
    missing_cols2 = set(final_1.columns) - set(pred_set.columns)
    for c in missing_cols2:
        pred_set[c] = 0
    pred_set = pred_set[final_1.columns]

    pred_set = pred_set.drop(['Result(Won)'], axis=1)

    # Predict!
    predictions = logreg.predict(pred_set)
    for i in range(len(pred_set)):
        print(backup_pred_set.iloc[i, 1] + " and " + backup_pred_set.iloc[i, 0])
        if predictions[i] == 1:
            print("Winner: " + backup_pred_set.iloc[i, 1])
        else:
            print("Winner: " + backup_pred_set.iloc[i, 0])
        print("")

In [189]:
# 1st semifinal - Group 1 winner vs Group 2 runner up

# 2nd semifinal - Group 2 winner vs Group 1 runner up

semi = [('Australia', 'India'),
            ('England', 'Pakistan')]

In [190]:

clean_and_predict(semi, rankings, final, rf1)


India and Australia
Winner: Australia

Pakistan and England
Winner: England



In [191]:
# finals between the winners of both the semifinals

finals = [('Australia', 'England')]


In [192]:
clean_and_predict(finals, rankings, final, rf1)

England and Australia
Winner: Australia

