# Mod 2 Summative Project - Soccer Match DB Transfer and Analysis

Importing necessary libraries for work below:

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.image import imread

from dark_sky_api_remy import api_key as api_key
from MongoHandler import MongoHandler
from PandaHandler import PandaHandler
from RainDataImproved import RainData
from SqlConn import SqlConn
%matplotlib inline

#### Installing less common libraries used in this notebook
These are not part of the Anaconda distribution.

In [3]:
!pip install timezonefinder



In [4]:
!pip install geopy



## Making a connection to the SQL Database and getting back a Pandas DataFrame

In [5]:
# Connect to 'database.sqlite' through the project-local SqlConn wrapper.
# The "Connection status" line printed below is emitted by the wrapper.
sql = SqlConn('database.sqlite')

Connection status: Active


In [6]:
# Fetch match rows as a DataFrame. The argument looks like a list of seasons
# (every row shown below has Season == 2011) — confirm against SqlConn.matches_df.
df = sql.matches_df([2011])
# Preview the first rows to see which columns came back.
df.head()

Connection status: Active


Unnamed: 0,Match_ID,Div,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR
0,1133,D2,2011,2011-07-15,Cottbus,Dresden,2,1,H
1,1167,D2,2011,2011-07-15,Greuther Furth,Ein Frankfurt,2,3,A
2,1551,D2,2011,2011-07-15,Frankfurt FSV,Union Berlin,1,1,D
3,1550,D2,2011,2011-07-16,Erzgebirge Aue,Aachen,1,0,H
4,1678,D2,2011,2011-07-16,St Pauli,Ingolstadt,2,0,H


### Be sure to close your connection when done querying the Database!

In [7]:
# `df` already holds the query result, so the database connection can be released.
sql.close_conn()

Closing connection
Connection status: Closed


## Working with the Pandas DataFrame - EDA

In [8]:
# Rebinds `df` to the helper's result; per the output below it carries an extra
# `tot_home_goals` column.
# NOTE(review): what that column totals is defined in PandaHandler — confirm there.
df = PandaHandler.tot_home_goals_scored(df)
df.head()

Unnamed: 0,Match_ID,Div,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,tot_home_goals
0,1133,D2,2011,2011-07-15,Cottbus,Dresden,2,1,H,30
1,1167,D2,2011,2011-07-15,Greuther Furth,Ein Frankfurt,2,3,A,73
2,1551,D2,2011,2011-07-15,Frankfurt FSV,Union Berlin,1,1,D,43
3,1550,D2,2011,2011-07-16,Erzgebirge Aue,Aachen,1,0,H,31
4,1678,D2,2011,2011-07-16,St Pauli,Ingolstadt,2,0,H,59


In [9]:
# Rebinds `df` again; per the output below the result gains the Home*/Away*
# win, loss and draw flags plus the tot_home_win / tot_home_loss / tot_home_draw columns.
# NOTE(review): how the totals are aggregated is defined in PandaHandler — confirm there.
df = PandaHandler.win_loss_draw(df)
df.head()

Unnamed: 0,Match_ID,Div,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,tot_home_goals,HomeWin,AwayWin,tot_home_win,HomeLoss,AwayLoss,tot_home_loss,HomeDraw,AwayDraw,tot_home_draw
0,1133,D2,2011,2011-07-15,Cottbus,Dresden,2,1,H,30,1,0,8,0,1,15,0,0,11
1,1167,D2,2011,2011-07-15,Greuther Furth,Ein Frankfurt,2,3,A,73,0,1,20,1,0,4,0,0,10
2,1551,D2,2011,2011-07-15,Frankfurt FSV,Union Berlin,1,1,D,43,0,0,7,0,0,13,1,1,14
3,1550,D2,2011,2011-07-16,Erzgebirge Aue,Aachen,1,0,H,31,1,0,8,0,1,15,0,0,11
4,1678,D2,2011,2011-07-16,St Pauli,Ingolstadt,2,0,H,59,1,0,18,0,1,8,0,0,8


## Querying the Dark Sky API

Beware: this returns ~290 entries, and the Dark Sky API only allows 1000 entries to be queried per day. If you exceed that limit, you will need to use a new API key.

In [7]:
# Bound the weather lookup by the dates on the first and last match rows of `df`.
match_dates = df['Date']
first_match_date = match_dates.iloc[0]
last_match_date = match_dates.iloc[-1]
raindata = RainData(first_match_date, last_match_date)

In [8]:
# Query Dark Sky for the date range configured on `raindata`, using the imported API key.
# This is the call that counts against the daily API quota described above.
raindf = raindata.get_rain_df(api_key)

In [9]:
# Preview the rain table (columns shown below: Date, Rain).
raindf.head()

Unnamed: 0,Date,Rain
0,2011-07-15,0
1,2011-07-16,0
2,2011-07-17,1
3,2011-07-18,1
4,2011-07-19,0


### Store the queried data from Dark Sky for use later 
(so if issues arise, you don't need to query again)

In [None]:
# Persist the rain table to disk so it can be reloaded without calling the API again.
raindf.to_pickle('pickled_rain_df.pkl')

Read the pickled pandas DataFrame back in for use below.

In [10]:
# Reload the rain table from the local pickle file.
# NOTE: unpickling can execute arbitrary code — only load a pickle file you created yourself.
raindf = pd.read_pickle('pickled_rain_df.pkl')

In [11]:
# Check that the reloaded table has the same Date / Rain layout as the queried one.
raindf.head()

Unnamed: 0,Date,Rain
0,2011-07-15,0
1,2011-07-16,0
2,2011-07-17,1
3,2011-07-18,1
4,2011-07-19,0


In [12]:
# Combine the match frame with the rain table; per the output below the result is
# indexed by team, with season totals and rain / non-rain win statistics.
# NOTE(review): how a match is matched to a rain day is defined in PandaHandler.rain_results.
df_all_data = PandaHandler.rain_results(df, raindf)
df_all_data.head(10)

Total Number of Games: 34


Unnamed: 0_level_0,Season,GoalsScored,Wins,Losses,Draws,RainGames,RainWins,NonRainWins,RainWin%,NonRainWin%,%ChangeWinWithRain
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Aachen,2011,30,6,15,13,12.0,2,4,0.166667,0.181818,-0.083333
Augsburg,2011,36,8,12,14,11.0,4,4,0.363636,0.173913,1.090909
Bayern Munich,2011,77,23,7,4,11.0,8,15,0.727273,0.652174,0.115152
Bochum,2011,41,10,17,7,10.0,3,7,0.3,0.291667,0.028571
Braunschweig,2011,37,10,9,15,8.0,3,7,0.375,0.269231,0.392857
Cottbus,2011,30,8,15,11,9.0,2,6,0.222222,0.24,-0.074074
Dortmund,2011,80,25,3,6,10.0,6,19,0.6,0.791667,-0.242105
Dresden,2011,50,12,13,9,8.0,3,9,0.375,0.346154,0.083333
Duisburg,2011,42,10,15,9,15.0,4,6,0.266667,0.315789,-0.155556
Ein Frankfurt,2011,76,20,6,8,8.0,3,17,0.375,0.653846,-0.426471


### Create all images in local folder for input into MongoDB

In [13]:
# Render one Wins/Losses/Draws bar chart per team and save it as
# hist_images/<team>.png, the path that the image-loading cell reads back.

# savefig raises FileNotFoundError when the target folder is missing, so create it first.
os.makedirs('hist_images', exist_ok=True)

# The season label is taken from the first row and reused for every chart.
season = str(int(df_all_data.iloc[0].Season))
cols = ['Wins', 'Losses', 'Draws']
bar_x = [season+'_Wins', season+'_Losses', season+'_Draws']

for team_name, record in df_all_data[cols].iterrows():
    bar_y = list(record)
    # Explicit figure/axes per team instead of drawing on pyplot's implicit
    # "current" figure, so no state is shared between teams.
    fig, ax = plt.subplots()
    ax.bar(x=bar_x, height=bar_y)
    ax.set_title(team_name)
    fig.savefig('hist_images/{}.png'.format(team_name))
    # Close the figure so it is neither kept in memory nor rendered as an
    # empty figure in the cell output.
    plt.close(fig)

<Figure size 432x288 with 0 Axes>

#### Turn each image into a NumPy array, then cast it to a list, so that MongoDB can interpret it

In [14]:
# Load every saved chart back from disk and keep its pixel data as a nested
# Python list, one entry per team in index order.
graph_column = []
for team_name in df_all_data.index:
    image_path = 'hist_images/{}.png'.format(team_name)
    pixels = imread(image_path)
    graph_column.append(pixels.tolist())

df_all_data['graph'] = graph_column
df_all_data.head()

Unnamed: 0_level_0,Season,GoalsScored,Wins,Losses,Draws,RainGames,RainWins,NonRainWins,RainWin%,NonRainWin%,%ChangeWinWithRain,graph
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Aachen,2011,30,6,15,13,12.0,2,4,0.166667,0.181818,-0.083333,"[[[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0], ..."
Augsburg,2011,36,8,12,14,11.0,4,4,0.363636,0.173913,1.090909,"[[[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0], ..."
Bayern Munich,2011,77,23,7,4,11.0,8,15,0.727273,0.652174,0.115152,"[[[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0], ..."
Bochum,2011,41,10,17,7,10.0,3,7,0.3,0.291667,0.028571,"[[[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0], ..."
Braunschweig,2011,37,10,9,15,8.0,3,7,0.375,0.269231,0.392857,"[[[1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0], ..."


## Mongo DB

**Please ensure** that your MongoDB server has been started from the terminal so the cells below run smoothly.

In [15]:
# Connect to the MongoDB server at 127.0.0.1:27017 through the project-local MongoHandler wrapper.
client = MongoHandler("mongodb://127.0.0.1:27017/")

In [16]:
# Get a handle on 'team_stats_collection' inside 'team_stats_db'.
# The repr below shows the returned object is a pymongo Collection.
collection = client.make_collection('team_stats_db', 'team_stats_collection')
collection

Now in the team_stats_collection collection in the team_stats_db database


Collection(Database(MongoClient(host=['127.0.0.1:27017'], document_class=dict, tz_aware=False, connect=True), 'team_stats_db'), 'team_stats_collection')

In [19]:
# Empty the collection before inserting — presumably so that re-running the
# insert cell does not accumulate duplicate team documents (confirm intent).
client.clear_collection('team_stats_db','team_stats_collection')

Collection team_stats_collection cleared


In [20]:
# Convert the summary frame with the project helper, bulk-insert the result,
# and report how many documents were written.
team_documents = MongoHandler.list_of_dicts(df_all_data)
results = collection.insert_many(team_documents)
inserted_count = len(results.inserted_ids)
print('Items inserted: {}'.format(inserted_count))

Items inserted: 36


In [21]:
# Read the stored documents back and print each one as a check on the insert.
# NOTE(review): the printed documents below show no `graph` or `_id` field, so
# query_db presumably applies a projection — confirm in MongoHandler.
query = client.query_db('team_stats_db', 'team_stats_collection')
for item in query:
    print(item)

{'name': 'Aachen', 'Season': 2011, 'GoalsScored': 30, 'Wins': 6, 'Losses': 15, 'Draws': 13, 'RainGames': 12, 'RainWins': 2, 'NonRainWins': 4, 'RainWin%': 0.1667, 'NonRainWin%': 0.1818, '%ChangeWinWithRain': -0.0833}
{'name': 'Augsburg', 'Season': 2011, 'GoalsScored': 36, 'Wins': 8, 'Losses': 12, 'Draws': 14, 'RainGames': 11, 'RainWins': 4, 'NonRainWins': 4, 'RainWin%': 0.3636, 'NonRainWin%': 0.1739, '%ChangeWinWithRain': 1.0909}
{'name': 'Bayern Munich', 'Season': 2011, 'GoalsScored': 77, 'Wins': 23, 'Losses': 7, 'Draws': 4, 'RainGames': 11, 'RainWins': 8, 'NonRainWins': 15, 'RainWin%': 0.7273, 'NonRainWin%': 0.6522, '%ChangeWinWithRain': 0.1152}
{'name': 'Bochum', 'Season': 2011, 'GoalsScored': 41, 'Wins': 10, 'Losses': 17, 'Draws': 7, 'RainGames': 10, 'RainWins': 3, 'NonRainWins': 7, 'RainWin%': 0.3, 'NonRainWin%': 0.2917, '%ChangeWinWithRain': 0.0286}
{'name': 'Braunschweig', 'Season': 2011, 'GoalsScored': 37, 'Wins': 10, 'Losses': 9, 'Draws': 15, 'RainGames': 8, 'RainWins': 3, 'Non