In [31]:
# Importing the required modules
import pandas as pd
from collections import defaultdict
from os import getcwd

In [32]:
# Locations of the input .csv files; all of them live in the dataKeeper
# folder underneath the current working directory
_DATA_KEEPER_DIR = f"{getcwd()}/dataKeeper"
PATH_TO_PLAYERS = f"{_DATA_KEEPER_DIR}/players.csv"
PATH_TO_ONFIELD = f"{_DATA_KEEPER_DIR}/onfield.csv"
PATH_TO_ONFIELD2 = f"{_DATA_KEEPER_DIR}/onfield2.csv"
PATH_TO_COMMENTS = f"{_DATA_KEEPER_DIR}/comments.csv"

In [33]:
# Load players.csv into a dataframe and show which columns it provides
df_players = pd.read_csv(PATH_TO_PLAYERS)
players_columns = list(df_players.columns)
print(f"COLUMNS : {players_columns}")

COLUMNS : ['playerId', 'batsman', 'strike_rate']


In [34]:
# Load onfield.csv into a dataframe and show which columns it provides
df_onfield = pd.read_csv(PATH_TO_ONFIELD)
onfield_columns = list(df_onfield.columns)
print(f"COLUMNS : {onfield_columns}")

COLUMNS : ['playerId', 'total_runs', 'number_of_balls']


In [35]:
# Load onfield2.csv into a dataframe and show which columns it provides
df_onfield2 = pd.read_csv(PATH_TO_ONFIELD2)
onfield2_columns = list(df_onfield2.columns)
print(f"COLUMNS : {onfield2_columns}")

COLUMNS : ['spectatorId', 'playerId', 'average', 'out']


In [36]:
# Load comments.csv into a dataframe and show which columns it provides
df_comments = pd.read_csv(PATH_TO_COMMENTS)
comments_columns = list(df_comments.columns)
print(f"COLUMNS : {comments_columns}")

COLUMNS : ['spectatorId', 'playerId', 'average', 'number_of_comments']


In [37]:
# Find out which id column can act as a key: a column qualifies only if it
# holds no repeated values in its dataframe
players_id_unique = df_players['playerId'].is_unique
onfield_id_unique = df_onfield['playerId'].is_unique
onfield2_spectator_unique = df_onfield2['spectatorId'].is_unique
comments_spectator_unique = df_comments['spectatorId'].is_unique

print(f" The column 'playerId' is unique in players dataframe: {players_id_unique}")
print(f" The column 'playerId' is unique in onfield dataframe: {onfield_id_unique}")
print(f" The column 'spectatorId' is unique in onfield2 dataframe: {onfield2_spectator_unique}")
print(f" The column 'spectratorId' is unique in comments dataframe: {comments_spectator_unique}")


 The column 'playerId' is unique in players dataframe: True
 The column 'playerId' is unique in onfield dataframe: True
 The column 'spectatorId' is unique in onfield2 dataframe: False
 The column 'spectratorId' is unique in comments dataframe: False


In [38]:
# playerId is unique in both the players and the onfield dataframe, so it is
# used as the primary key: order both frames by it
df_players_sorted = df_players.sort_values("playerId")
df_onfield_sorted = df_onfield.sort_values("playerId")

In [39]:
# Players dataframe
# All three lists must come from the frame sorted by playerId so that position i
# refers to the same player in each of them (the strike rates were previously
# taken from the unsorted frame, which misaligned them with the ids and names).
playerIds = df_players_sorted["playerId"].tolist()
playerNames = df_players_sorted["batsman"].tolist()
playerStrikeRates = df_players_sorted["strike_rate"].tolist()

# Onfield dataframe (also ordered by playerId, so it lines up with the lists above by position)
total_runs = df_onfield_sorted["total_runs"].tolist()
number_of_balls = df_onfield_sorted["number_of_balls"].tolist()

In [40]:
# Building two lookups in a single pass over the players:
#   playerDict            : playerId -> strike rate plus onfield runs / balls
#   global_secondaryIndex : batsman name -> playerId
# NOTE: the players lists and the onfield lists are combined purely by position,
# so both sources have to describe the same players in the same order.
playerDict = {}
global_secondaryIndex = {}
for indx in range(len(playerIds)):
    playerId = playerIds[indx]
    onfield_stats = {"runs" : total_runs[indx], "balls" : number_of_balls[indx]}
    playerDict[playerId] = {"strike_rate" : playerStrikeRates[indx], "onfield" : onfield_stats}
    global_secondaryIndex[playerNames[indx]] = playerId

In [41]:
# Releasing the temporary column lists; their contents have already been copied
# into playerDict and global_secondaryIndex
del playerIds, playerNames, playerStrikeRates, total_runs, number_of_balls

In [42]:
# Converting all columns of onfield2 table into separate lists
spectatorIds   = df_onfield2["spectatorId"].tolist()
playerIds      = df_onfield2["playerId"].tolist()
playerAverages = df_onfield2["average"].tolist()
playerOuts     = df_onfield2["out"].tolist()

# Grouping the onfield2 rows by playerId: every row becomes exactly one record
# in the list of the player it belongs to.
# The earlier version probed the list with the spectatorId as a list position,
# which silently dropped rows whose spectatorId happened to be a valid index,
# and relied on a duplicated first element that was sliced off afterwards.
performance = defaultdict(list)
for mid, spec_id, player_out, player_avg in zip(playerIds, spectatorIds, playerOuts, playerAverages):
    performance[mid].append(
        {
            "spectatorId" : spec_id,
            "out" : player_out,
            "average" : player_avg
        }
    )

# Adding the data from performance to playerDict.
# Players without any onfield2 row get an empty list; .get() is used so that
# the lookup does not create empty entries inside the defaultdict.
for mid in playerDict:
    playerDict[mid]["out"] = list(performance.get(mid, []))

In [43]:
# Converting all columns of comments table into separate lists
spectatorIds       = df_comments["spectatorId"].tolist()
playerIds          = df_comments["playerId"].tolist()
spectator_comments = df_comments["number_of_comments"].tolist()
playerAverages     = df_comments["average"].tolist()

# Grouping the comments rows by playerId: every row becomes exactly one record
# in the list of the player it belongs to.
# The earlier version probed the list with the spectatorId as a list position,
# which silently dropped rows whose spectatorId happened to be a valid index,
# and relied on a duplicated first element that was sliced off afterwards.
commenting = defaultdict(list)
for mid, spec_id, comment_count, player_avg in zip(playerIds, spectatorIds, spectator_comments, playerAverages):
    commenting[mid].append(
        {
            "spectatorId" : spec_id,
            "number_of_comments" : comment_count,
            "average" : player_avg
        }
    )

# Adding the data from commenting to playerDict.
# Players without any comments row get an empty list. Every mid comes from
# playerDict itself, so no "player doesn't exist" fallback is needed, and
# global_secondaryIndex (keyed by batsman name, not playerId) is left untouched.
for mid in playerDict:
    playerDict[mid]["number_of_comments"] = list(commenting.get(mid, []))

In [44]:
# Importing needed module
import json

# Writing both lookup structures to disk as key-sorted, indented JSON,
# announcing each write before it starts
for notice, target_path, payload in (
    ("[INFO] Uploading player Data into the disk...", 'dataKeeper/dataBase.json', playerDict),
    ("[INFO] Uploading Global Secondary Index Data into the disk...", 'dataKeeper/dataBase_GSI.json', global_secondaryIndex),
):
    print(notice)
    with open(target_path, 'w') as fp:
        json.dump(payload, fp, sort_keys=True, indent=4)

[INFO] Uploading player Data into the disk...
[INFO] Uploading Global Secondary Index Data into the disk...


In [45]:
# Running stringMatcher.py
# Importing required module
import numpy as np

# Creating the function below to score how similar two strings are, based on the minimum number of single-character edits required to convert one string into the other
def levenshteinDistanceRatio(s, t, ratio_calc = True):
    """Return a similarity ratio for two strings derived from their edit distance.

    A dynamic-programming table is filled in which cell [i][j] holds the
    cheapest cost of turning the first i characters of ``s`` into the first j
    characters of ``t``. Insertions and deletions cost 1; a substitution costs
    2 when ``ratio_calc`` is True and 1 otherwise.

    Parameters
    ----------
    s, t : str
        The two strings to compare. Either may be empty.
    ratio_calc : bool, default True
        Selects the substitution cost (2 if True, 1 otherwise).

    Returns
    -------
    float (numpy.float64)
        ``(len(s) + len(t) - distance) / (len(s) + len(t))``.
        Identical strings give 1.0, and two empty strings are treated as
        identical (1.0) rather than dividing by zero.
    """
    total_length = len(s) + len(t)
    if total_length == 0:
        # Nothing to compare: two empty strings are identical
        return np.float64(1.0)

    # Initializing a matrix of zeros
    rows = len(s)+1
    cols = len(t)+1
    distance = np.zeros((rows,cols),dtype = int)

    # Filling the first column and first row with the cost of building each
    # prefix from the empty string. The two borders are filled independently
    # so that they are also set when one of the strings is empty.
    for i in range(1, rows):
        distance[i][0] = i
    for k in range(1, cols):
        distance[0][k] = k

    # Handling the results according to Python Levenshtein package.
    # The explicit comparison with True is kept on purpose so that only
    # True (or a value equal to it) selects the ratio cost.
    substitution_cost = 2 if ratio_calc == True else 1

    # Looping over it to compute the cost of single-character edits
    for col in range(1, cols):
        for row in range(1, rows):
            # same character will cost 0
            cost = 0 if s[row-1] == t[col-1] else substitution_cost

            # Finding least of single-character editing cost(deletion,insertion,substitution)
            distance[row][col] = min(distance[row-1][col] + 1,
                                     distance[row][col-1] + 1,
                                     distance[row-1][col-1] + cost)

    # Lastly, computing the Levenshtein Distance Ratio from the bottom-right
    # cell, addressed explicitly instead of through the leftover loop variables
    # (which do not exist when either loop never runs)
    Ratio = (total_length - distance[rows-1][cols-1]) / total_length
    return Ratio