# YGO-Code Net Deck - Main Notebook
This notebook contains all relevant functions and steps for generating a brand new deck based on duelingbook replays.
Read the notes in each section before running any code.
I do not recommend running this code; if you do, it is at your own peril.

This code was last tested using <br>
Python 3.9.16 <br>
Last Test: 2023.03.02 <br>

### Notes
This notebook cannot currently scrape replays, as Duelingbook (DB) has updated its authorisation process.
I am looking for solutions.


### Set Up
In order to run this notebook, you'll need to install some libraries.
As per usual, I'd recommend you set up a new environment before carrying on with this project.
<br>
The necessary libraries are noted in requirements.txt
<br>
You will need to install ipykernel before you can run scripts in your environment, details here:
https://packaging.python.org/en/latest/tutorials/installing-packages/
<br>


In [None]:
# Complete deck builder robot:

#1. Scrape latest ygo card database - use ygoprodeck api
#2. Request card suggestions from user
#3. Grab replays
#4. Convert replays to hands
#5. Train the ML portion
#6. optimise the suggested main deck

# DO NOT RUN THIS CODE WITHOUT READING THROUGH THE INSTRUCTIONS
# ALL CODE IS RUN AT THE USER'S OWN RISK


In [None]:
# Constants
# Module-level settings shared by the cells below. The two empty strings are
# placeholders that later functions overwrite through `global` statements.

# Overwritten by save_deck() with the path of the CSV it has just written.
suggested_deck_path = ""
# Overwritten by train_hand_classifier() with the path of the pickled model.
model_path = ""
# Prefix for the card database, deck suggestion and model folders (see the
# path cell below); it is concatenated directly, so keep the trailing slash.
start_folder = "/ygo-code-net-deck/" # Put the folder path here
# Prefix for the replay and hand folders. It is also concatenated directly,
# so the value must end in '/'. The string below is a placeholder to replace.
project_path = 'folder path for where to save stored replays, must end in /' # Keep this separate from the rest of the code, otherwise it gets really crowded



In [None]:
#1
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import platform
import json


from tkinter import *
import datetime


import re
from os import listdir
from os.path import isfile, join
import os

from tqdm import tqdm
import multiprocessing
from joblib import Parallel, delayed

import codecs

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
import pickle
from datetime import datetime



In [None]:

# Paths and URLs derived from start_folder / project_path, grouped by the
# pipeline step (#1-#5) that uses them.

#1
# Pickle location for the card database (presumably the save_path handed to
# save_ygo_database -- confirm against the calling cell).
database_save_path = start_folder +'Construct-Base-Deck/YGO-Card-Database/YGODatabase.pickle'
# Endpoint queried by save_ygo_database().
ygo_pro_api_url = 'https://db.ygoprodeck.com/api/v7/cardinfo.php'

#2
# Folder that save_deck() writes the suggestion CSV into.
suggestions_save_folder = start_folder + 'Construct-Base-Deck/Decks/'

#3
# capture_replay() appends the zero-padded replay number to this URL.
db_replay_home_url = 'https://www.duelingbook.com/view-replay?id=-'

replay_folder_path = project_path + 'Replays'
# Folder capture_replay() passes to compress_replay_debug_mode() as save_folder.
compressed_replay_folder_path = project_path + 'Compressed_Replays'

log_name = 'log.txt'
log_path = replay_folder_path + '/' + log_name


#4
hand_save_folder = project_path + 'Hands'

#5
hand_file_path = hand_save_folder +"/Hands.csv"
model_folder_path = start_folder+'Model_Folder/'


In [None]:
# Functions - good programmers will put this in a separate file and import,
# but idk if you're going to save that in the right place, so here are all the functions
#1 ################################################################################################

def creation_date(path_to_file):
    """
    Return the creation time of ``path_to_file`` as a ``datetime``.

    On Windows the value of ``os.path.getctime`` is used. On other systems
    the ``st_birthtime`` field of ``os.stat`` is used when it exists;
    otherwise the last-modification time (``st_mtime``) is returned instead.
    See http://stackoverflow.com/a/39501288/1709587 for explanation.
    """
    running_on_windows = platform.system() == 'Windows'
    if running_on_windows:
        timestamp = os.path.getctime(path_to_file)
    else:
        file_stat = os.stat(path_to_file)
        # A stat result without st_birthtime (probably Linux) offers no easy
        # creation date, so settle for the last content modification.
        timestamp = getattr(file_stat, 'st_birthtime', None)
        if timestamp is None:
            timestamp = file_stat.st_mtime
    return datetime.fromtimestamp(timestamp)



def assign_ban_status(banlist_info_dict):
    """Translate a card's ``banlist_info`` entry into a ban-status label.

    Parameters
    ----------
    banlist_info_dict : dict or missing value
        The ``banlist_info`` value of one card record. Only the ``'ban_tcg'``
        key is consulted.

    Returns
    -------
    str
        ``'Limited'``, ``'Semi-Limited'`` or ``'Forbidden'`` (for a
        ``'Banned'`` entry). Everything else returns ``'Unlimited'``: missing
        values, non-dict input, a dict without ``'ban_tcg'``, or an
        unrecognised ``'ban_tcg'`` value. The last case used to return ``None``.
    """
    if not isinstance(banlist_info_dict, dict):
        return 'Unlimited'

    ban_tcg = banlist_info_dict.get('ban_tcg')
    if not isinstance(ban_tcg, str):
        return 'Unlimited'

    status_by_tcg_value = {
        'Limited': 'Limited',
        'Semi-Limited': 'Semi-Limited',
        'Banned': 'Forbidden',
    }
    return status_by_tcg_value.get(ban_tcg, 'Unlimited')
    


def save_ygo_database(save_path):
    """Download the card list from ``ygo_pro_api_url`` and pickle a slim table.

    The ``data`` records of the JSON response are loaded into a data frame and
    three columns are written to ``save_path`` with ``DataFrame.to_pickle``:

    * ``Name`` -- copy of the ``name`` field.
    * ``Ban_Status`` -- ``assign_ban_status`` applied to ``banlist_info``.
    * ``Extra_Deck_Status`` -- ``'Extra Deck'`` when ``frameType`` contains
      ``link``, ``synchro``, ``xyz`` or ``fusion``, else ``'Main Deck'``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If the server stops responding for longer than the timeout.
    """
    # Fail on HTTP errors rather than trying to parse an error page as card
    # data, and never wait indefinitely on a stalled connection.
    response = requests.get(ygo_pro_api_url, timeout=60)
    response.raise_for_status()
    card_df = pd.DataFrame(json.loads(response.content)['data'])

    card_df['Name'] = card_df['name']
    card_df['Ban_Status'] = card_df['banlist_info'].apply(assign_ban_status)

    extra_deck_types = ['link', 'synchro', 'xyz', 'fusion']
    card_df['Extra_Deck_Status'] = card_df['frameType'].apply(
        lambda frame_type: 'Extra Deck'
        if any(extra_str in frame_type for extra_str in extra_deck_types)
        else 'Main Deck'
    )

    card_df[['Name', 'Extra_Deck_Status', 'Ban_Status']].to_pickle(save_path)



#2 ######################################################################################################


#Update list box
def update(data):
    """Replace the contents of the ``search_list`` list box with ``data``."""
    # Empty the list box, then append every entry in a single insert call.
    search_list.delete(0, END)
    search_list.insert(END, *data)

#Update entry with list box click
def fillout(event):
    """Event handler: copy the active ``search_list`` item into ``search_entry``."""
    # Wipe whatever is currently typed in the entry box.
    search_entry.delete(0, END)

    # Put the active list-box item in its place.
    active_item = search_list.get(ACTIVE)
    search_entry.insert(0, active_item)
    
    
#check if typed entry is in box below
def check(event):
    """Event handler: filter the search list box by the text in ``search_entry``.

    An empty entry shows every name in ``Names``; otherwise only names that
    contain the typed text (case-insensitively) are shown.
    """
    typed = search_entry.get()

    if typed == '':
        matches = Names
    else:
        matches = [name for name in Names if typed.lower() in name.lower()]

    # Show the filtered names in the list box.
    update(matches)
                
#add card to current deck list
def add_to_deck():
    """Add the card typed in ``search_entry`` to ``Deck_Names``.

    The typed text must match exactly one entry of ``Names`` (ignoring case)
    and must not already be in ``Deck_Names``. ``start_label`` reports the
    outcome, and the deck list box is refreshed in every case.
    """
    start_label['text'] = 'Adding card...'
    query = search_entry.get()

    if query != '':
        # Names whose lower-cased form equals the lower-cased query.
        matches = [name for name in Names if query.lower() == name.lower()]

        if len(matches) != 1:
            start_label['text'] ='Please select one card.'
        elif matches[0] in Deck_Names:
            start_label['text'] = 'Already selected.'
        else:
            Deck_Names.append(matches[0])
            start_label['text'] = 'Start Typing...'
    else:
        start_label['text'] ='Please select one card.'

    refresh_deck(Deck_Names)
    
    
    
def refresh_deck(data):
    """Replace the contents of the ``deck_list`` list box with ``data``."""
    # Empty the list box, then append every deck entry in a single insert call.
    deck_list.delete(0, END)
    deck_list.insert(END, *data)
        
def remove(event):
    """Event handler: drop the active ``deck_list`` item from ``Deck_Names``.

    If the active item is not in ``Deck_Names`` (for example when the list box
    has no entries), nothing is removed; previously ``list.remove`` raised
    ``ValueError``. The list box is refreshed either way.
    """
    selected_card = deck_list.get(ACTIVE)
    if selected_card in Deck_Names:
        Deck_Names.remove(selected_card)
    refresh_deck(Deck_Names)
    
#Save down deck 
def save_deck():
    """Write the current deck and engine-requirement lists to a dated CSV.

    The file is named ``<title>_<YYYY-MM-DD>.csv`` inside
    ``suggestions_save_folder``. ``<title>`` is the text in
    ``deck_name_entry``, or ``Draft`` when that entry is empty. Cards from
    ``Deck_Names`` get ``Engine_Requirement_Flag`` 0 and cards from
    ``Engine_Names`` get 1. The global ``suggested_deck_path`` is set to the
    written path.
    """
    global suggested_deck_path

    start_label['text'] = 'Saving...'

    typed_title = deck_name_entry.get()
    title_prefix = typed_title if typed_title != '' else 'Draft'
    deck_title = title_prefix + '_' + datetime.today().strftime('%Y-%m-%d') + '.csv'

    save_path = suggestions_save_folder + deck_title
    suggested_deck_path = save_path

    optional_cards = pd.DataFrame(
        {'Names': Deck_Names, 'Engine_Requirement_Flag': [0] * len(Deck_Names)}
    )
    required_cards = pd.DataFrame(
        {'Names': Engine_Names, 'Engine_Requirement_Flag': [1] * len(Engine_Names)}
    )
    pd.concat([optional_cards, required_cards]).to_csv(save_path)

    start_label['text'] = 'Saved.'



#Make Engine Requirement
def eng_req_card():
    """Move the active ``deck_list`` card from ``Deck_Names`` to ``Engine_Names``.

    If the active item is not in ``Deck_Names`` (for example when the deck
    list box is empty), nothing is moved and ``start_label`` asks the user to
    select a card; previously ``list.remove`` raised ``ValueError``. Both list
    boxes are refreshed after a successful move.
    """
    start_label['text'] = 'Making Engine Requirement...'
    selected_card = deck_list.get(ACTIVE)

    if selected_card not in Deck_Names:
        start_label['text'] ='Please select one card.'
        return

    Deck_Names.remove(selected_card)
    refresh_deck(Deck_Names)
    Engine_Names.append(selected_card)
    # One refresh is enough; the second identical call was redundant.
    refresh_eng_reqs(Engine_Names)
    
def refresh_eng_reqs(data):
    """Replace the contents of the ``req_list`` list box with ``data``."""
    # Empty the list box, then append every requirement in a single insert call.
    req_list.delete(0, END)
    req_list.insert(END, *data)
        
def remove_req(event):
    """Event handler: drop the active ``req_list`` item from ``Engine_Names``.

    If the active item is not in ``Engine_Names`` (for example when the list
    box has no entries), nothing is removed; previously ``list.remove`` raised
    ``ValueError``. The list box is refreshed either way.
    """
    selected_card = req_list.get(ACTIVE)
    if selected_card in Engine_Names:
        Engine_Names.remove(selected_card)
    refresh_eng_reqs(Engine_Names)


#3 ##########################################################################################################################


def capture_replay(i):
    """Fetch replay number ``i`` and store a compressed copy of it.

    The replay page is requested from ``db_replay_home_url`` plus the
    zero-padded (8 digit) id. When the response is 200 and the first ``<p>``
    of the body holds a replay, its text is passed to
    ``compress_replay_debug_mode`` with ``compressed_replay_folder_path`` as
    the destination. Progress and failures are reported with ``print``;
    nothing is returned.
    """
    replay_id = str(i).zfill(8)
    print('checking ' + replay_id)

    db_url = db_replay_home_url + replay_id
    source = requests.get(db_url)

    if source.status_code != 200:
        return

    print('Site exists... Parsing')
    soup = BeautifulSoup(source.text, 'lxml')

    body = soup.find('body')
    # soup.find returns None when there is no <body>; treat that the same as
    # a body without a paragraph instead of raising AttributeError.
    if body is None or body.p is None:
        print('URL does not contain valid replay.')
        return

    replay_text = body.p.text
    if "Replay does not exist" in replay_text:
        print('Replay does not exist.')
        return

    try:
        compress_replay_debug_mode(replay_text, compressed_replay_folder_path, i)
    except Exception:
        # Best effort: one malformed replay must not stop a scraping run, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print('Failed to parse and compress replay with ID:' + str(i))



def compress_replay_debug_mode(replay_text, save_folder, N, debug_switch=0):
    """Reduce raw replay text to a short header plus the events of interest.

    Parameters
    ----------
    replay_text : str
        Raw replay text as scraped from the replay page.
    save_folder : str
        Folder that receives ``<N>.txt``.
    N : int or str
        Replay id; used as the first output line and as the file name.
    debug_switch : int, default 0
        When equal to 1 the lines are printed and no file is touched
        (previously an existing file was truncated to empty in this mode).

    Output lines
    ------------
    Seven header lines (id, date, both players with ratings, format, rules,
    match type) followed by one line per kept event in the form
    ``<log text>|<username>``. Kept events are those whose public log mentions
    'Chose to go', 'Finished siding', 'Admitted defeat', 'Left duel' or
    'Drew '. For 'Drew a card' events the private log found in the preceding
    chunk is stored instead of the public text.

    Raises
    ------
    IndexError
        If one of the expected header markers is absent from ``replay_text``.
    """

    def field_after(marker, terminator):
        # Text between the first `marker` and the next `terminator`.
        tail = replay_text.split(marker)[1]
        return tail[:tail.find(terminator)]

    replay_id = N

    # Player names and ratings
    player1 = field_after('"player1":"', '","')
    player2 = field_after('"player2":"', '","')
    player1_rating = field_after('"player1":{"rating":', ',"')
    player2_rating = field_after('"player2":{"rating":', ',"')

    # Match info
    match_format = 'Format: ' + field_after('"format":"', '","')
    tcg_ocg = 'Rules: ' + field_after('"rules":"', '","')
    match_type = 'Match Type: ' + field_after('"match_type":"', '","')

    # The ten characters that follow 'date":"'
    date_start = replay_text.find('date":"') + 7
    date = replay_text[date_start:date_start + 10]

    lines = [str(replay_id),
             'Date: ' + str(date),
             str(player1) + ' rated: ' + str(player1_rating),
             str(player2) + ' rated: ' + str(player2_rating),
             match_format,
             tcg_ocg,
             match_type]

    # Each chunk starts with one public log entry. The private log of that
    # same play sits at the end of the previous chunk.
    kept_events = ('Chose to go', 'Finished siding', 'Admitted defeat', 'Left duel', 'Drew ')
    public_actions = replay_text.split('"public_log":')
    actions = []
    for position, chunk in enumerate(public_actions):
        username = chunk[chunk.find('"username":') + 11:chunk.find('}')]
        entry = chunk[:chunk.find(',"')] + '|' + username

        if not any(event in entry for event in kept_events):
            continue
        if entry.startswith('"\\"'):  # this removes chat
            continue

        entry = re.sub(r' +\([0-9].+[0-9]+\)', '', entry)  # don't care which card from hand
        entry = re.sub(r' \(now [0-9]*\)', '', entry)

        if "Drew a card" in entry:
            # The public log hides the card; take the name from the private log.
            priv_slice = public_actions[position - 1]
            priv_log = (priv_slice[priv_slice.find('"private_log":') + 14:priv_slice.rfind('"",')]
                        + '""|' + username)
            actions.append(priv_log)
        else:
            actions.append(entry)

    lines = lines + actions

    if debug_switch == 1:
        for line in lines:
            print(line)
    else:
        # A single write-mode open replaces the old truncate-then-append pair
        # and guarantees the handle is closed even if a write fails.
        with open(save_folder + '/' + str(N) + '.txt', "w") as f:
            for line in lines:
                f.write(line + '\n')

    print('...Parsed')

#4 ########################################################################################################


def handify_replay(file_path):
    """Turn one compressed replay file into a data frame of opening hands.

    The file is expected to hold seven header lines (replay id, date, the two
    '<name> rated: <rating>' lines, format, rules, match type) followed by
    event lines. The text is cut at every 'Finished siding' marker and every
    second piece is treated as one game. For each game and each player, the
    first five lines containing "Drew" become one row per distinct card with
    its count and the match metadata.

    A line belongs to player 1 when it contains player 1's name; every other
    line is attributed to player 2. The loser is named on the last
    'Admitted defeat' line, or else on the first 'Left duel' line.

    Returns
    -------
    pandas.DataFrame
        Columns ``index, CardName, CardCount, WinnerFlag, Player, Opponent,
        Rating, OppRating, Format, Rules, MatchType, ReplayID, GameNumber,
        Date``. If the file cannot be read or parsed, or contains no usable
        game, an empty frame with the single column ``WinnerFlag`` is returned.

    Notes
    -----
    A game whose loser cannot be determined is skipped. Previously such a game
    silently reused the loser of the preceding game, or discarded the whole
    replay when it was the first game. A player with no draw events in a game
    still discards the whole replay, as before.
    """
    try:
        with open(file_path, "r") as replay_file:
            replay_text = replay_file.read()

        replay_events = replay_text.split('\n')

        replay_id = int(replay_events[0])

        replay_date = replay_events[1]
        replay_date = replay_date[replay_date.rfind(':')+2:]

        player1 = replay_events[2]
        player1_name = player1[:player1.find(' rated:')]
        player1_rating = int(player1[player1.rfind(' rated:')+7:])

        player2 = replay_events[3]
        player2_name = player2[:player2.find(' rated:')]
        player2_rating = int(player2[player2.rfind(' rated:')+7:])

        match_format = replay_events[4]
        match_format = match_format[match_format.rfind('Format: ')+8:]

        rules = replay_events[5]
        rules = rules[rules.rfind('Rules: ')+7:]

        match_type = replay_events[6]
        match_type = match_type[match_type.rfind('Match Type: ')+12:]

        # Both players emit 'Finished siding' between games, so every second
        # piece of the split is the gap between the two markers.
        games = replay_text.split('Finished siding')[::2]

        hands_list = []
        for game_number, game in enumerate(games):
            game_events = (game + 'Finished siding').split('\n')

            loser = _find_game_loser(game_events, player1_name, player2_name)
            if loser is None:
                # Unknown outcome: do not guess a label for training data.
                continue

            player1_actions = [line for line in game_events if player1_name in line]
            player2_actions = [line for line in game_events if player1_name not in line]

            match_info = {
                'Format': match_format,
                'Rules': rules,
                'MatchType': match_type,
                'ReplayID': replay_id,
                'GameNumber': game_number,
                'Date': replay_date,
            }

            hands_list.append(_opening_hand_frame(
                player1_actions, 0 if loser == 'player1' else 1,
                player1_name, player2_name, player1_rating, player2_rating, match_info))
            hands_list.append(_opening_hand_frame(
                player2_actions, 0 if loser == 'player2' else 1,
                player2_name, player1_name, player2_rating, player1_rating, match_info))

        if not hands_list:
            return pd.DataFrame(columns = ['WinnerFlag'])
        return pd.concat(hands_list).reset_index()
    except Exception:
        # Best effort: an unreadable or malformed replay yields an empty frame.
        return pd.DataFrame(columns = ['WinnerFlag'])


def _find_game_loser(game_events, player1_name, player2_name):
    """Return 'player1', 'player2', or None when no loser line names a player."""
    loser_line = None
    for line in game_events:
        if 'Admitted defeat' in line:
            loser_line = line  # keep the last admission
    if loser_line is None:
        for line in game_events:
            if "Left duel" in line:
                loser_line = line
                break  # the first person to leave
    if loser_line is None:
        return None

    loser = None
    if player1_name in loser_line:
        loser = 'player1'
    if player2_name in loser_line:
        loser = 'player2'
    return loser


def _opening_hand_frame(actions, winner_flag, player, opponent, rating, opp_rating, match_info):
    """Build the per-card count frame for one player's first five draws of a game."""
    decode = codecs.getdecoder("unicode_escape")
    # Skip the 8-character prefix and cut at the closing escaped-quote run.
    draw_events = [decode(line[8:line.rfind('\\\\""')])[0] for line in actions if "Drew" in line]

    opening_hand = draw_events[:5]
    if not opening_hand:
        # Matches the old behaviour, where pd.concat([]) raised and the caller
        # discarded the replay.
        raise ValueError('No draw events found for ' + str(player))

    hand_df = (
        pd.DataFrame({'CardName': opening_hand, 'CardCount': 1})
        .groupby('CardName', as_index=False)['CardCount']
        .sum()
    )

    hand_df['WinnerFlag'] = winner_flag
    hand_df['Player'] = player
    hand_df['Opponent'] = opponent
    hand_df['Rating'] = rating
    hand_df['OppRating'] = opp_rating
    for column, value in match_info.items():
        hand_df[column] = value
    return hand_df


#5 ################################################################

def train_hand_classifier(hand_file_path, suggested_deck_path, database_path, model_folder_path):
    """Train and save a random-forest classifier that predicts wins from opening hands.

    Parameters
    ----------
    hand_file_path : str
        CSV of hands with (at least) the columns CardName, CardCount,
        WinnerFlag, Format, Rules, ReplayID, GameNumber and Date.
    suggested_deck_path : str
        CSV of suggested cards with a ``Names`` column; also used to derive
        the name of the saved files.
    database_path : str
        Card database read with ``pd.read_csv``; must provide ``Name`` and
        ``Extra_Deck_Status``.
    model_folder_path : str
        Folder prefix (concatenated directly) for the saved model and the
        training split.

    Behaviour
    ---------
    1. Asks on stdin whether to keep only 'ar' format / 'TCG' rules rows and
       for the earliest replay date to keep.
    2. Keeps only hands that contain at least one suggested main-deck card
       and renames every other card to 'Other'.
    3. Pivots to one row per hand with one count column per card, then limits
       the number of 'Other' cards per hand (smallest limit in 0..5 that
       leaves more than 5000 rows, else 5).
    4. Fits ``RandomForestClassifier`` on an 80/20 split, prints a relative
       strength figure, and saves the model plus ``X_train`` / ``y_train``.

    Side effects: sets the globals ``model_path`` and ``X_train_path`` and
    writes three files. When no hand survives the filters, the function now
    prints the warning and returns without training; previously it went on and
    crashed.
    """
    global model_path
    global X_train_path

    hand_data = pd.read_csv(hand_file_path).fillna(0)

    # Filtering questions

    advanced_rate_answer = input("Only review Advanced Rated matches? [Y/N] ")
    while advanced_rate_answer not in ['Y', 'N']:
        advanced_rate_answer = input("Please only give capitalised answers: 'Y' or 'N'... ")
    TCG_answer = input("Only review TCG matches? [Y - TCG only/N - OCG or TCG] ")
    while TCG_answer not in ['Y', 'N']:
        TCG_answer = input("Please only give capitalised answers: 'Y' or 'N'... ")

    learning_data = hand_data.copy()
    for column in ('ReplayID', 'GameNumber', 'WinnerFlag'):
        learning_data[column] = learning_data[column].astype(int)

    if advanced_rate_answer == 'Y':
        learning_data = learning_data[learning_data['Format']=='ar']

    if TCG_answer == 'Y':
        learning_data = learning_data[learning_data['Rules']=='TCG']

    # Only want latest banlist onwards?
    date_answer = input("Please give earliest suitable date for replays (YYYY-MM-DD) ")
    # .copy() so the column assignments below act on an independent frame.
    learning_data = learning_data[learning_data['Date']>=date_answer].copy()

    # Only want cases where at least one relevant (main deck) card is mentioned
    deck_cards = pd.read_csv(suggested_deck_path)
    db_cards = pd.read_csv(database_path)
    deck_cards = pd.merge(deck_cards, db_cards, how = 'inner', left_on = 'Names', right_on = 'Name')
    deck_cards = deck_cards[deck_cards['Extra_Deck_Status']=='Main Deck']
    card_list = deck_cards['Names'].to_list()

    # unique_id = replay id + game number + winner flag, so its last digit is
    # the winner flag (relied upon further down).
    learning_data['replay_game_id'] = (
        learning_data['ReplayID'].astype(str) + learning_data['GameNumber'].astype(str)
    )
    learning_data['unique_id'] = (
        learning_data['replay_game_id'] + learning_data['WinnerFlag'].astype(str)
    )

    is_deck_card = learning_data['CardName'].isin(card_list)
    relevant_ids = set(learning_data.loc[is_deck_card, 'unique_id'])
    learning_data = learning_data[learning_data['unique_id'].isin(relevant_ids)].copy()

    if len(learning_data)==0:
        print("Insufficient data to calculate deck success...")
        return

    learning_data['CardName'] = learning_data['CardName'].where(
        learning_data['CardName'].isin(card_list), 'Other'
    )

    # Only the summed card counts feed the pivot below.
    aggregated_data = (
        learning_data.groupby(['CardName', 'unique_id'])['CardCount'].sum().reset_index()
    )

    card_counts = aggregated_data.pivot(index='unique_id', columns = 'CardName', values='CardCount').fillna(0)
    if 'Other' not in card_counts.columns:
        # Every hand consisted solely of suggested cards; keep the column so
        # the filter below works and saved features always include 'Other'.
        card_counts['Other'] = 0.0
    final_data = card_counts.reset_index()
    final_data['WinnerFlag'] = final_data['unique_id'].apply(lambda x: 0 if int(x)%2 == 0 else 1)

    # Limit to available cards if possible
    other_count = 0
    for i in range(6):
        available_row = len(final_data[final_data['Other']<=i])
        other_count = i
        if available_row>5000:
            break

    final_data = final_data[final_data['Other']<=other_count]

    # Random forest classifier: every column except unique_id (first) and
    # WinnerFlag (last) is a feature.
    card_list_keep = final_data.columns.to_list()[1:-1]
    X = final_data[card_list_keep]
    y = final_data['WinnerFlag']

    print('Identified '+str(len(X))+' suitable games.')

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42)

    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)

    # Assess the model
    # NOTE(review): roc_curve is fed hard 0/1 predictions here; scikit-learn
    # expects scores such as predict_proba(...)[:, 1]. Left unchanged so the
    # printed figure stays comparable -- confirm which is intended.
    y_pred = clf.predict(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)

    print('Learning data of ' + str(len(final_data)) + ' rows.\n Split 20% testing...')

    # Estimated overall win-rate for the deck
    win_rate_est = final_data['WinnerFlag'].mean()

    # AUC / win-rate is used as an ad-hoc strength measure with a 0.7 cut-off;
    # the original author describes this choice as unjustified.
    model_strength = auc(fpr,tpr) / win_rate_est

    print('Relative Model Strength: ' + str(round(model_strength, 3)))

    if model_strength < 0.7:
        print('Struggling to establish strong model.\n You may experience unexpected optimization.')

    # Save the model
    deck_name = suggested_deck_path[suggested_deck_path.rfind('/')+1:-4]
    filename = model_folder_path + 'RandomForestClassifier_'+deck_name+'.sav'
    model_path = filename
    with open(filename, 'wb') as model_file:
        pickle.dump(clf, model_file)

    X_train_path = model_folder_path +'RandomForestClassifier_'+deck_name+'X_train.csv'
    X_train.to_csv(X_train_path, index=False)
    y_train.to_csv(model_folder_path +'RandomForestClassifier_'+deck_name+'y_train.csv', index=False)

    print('Model saved.')



#6 ##########################################################################################



def generate_random_deck(main_suggested_cards):
    """Return a copy of ``main_suggested_cards`` with a random ``Count`` column.

    Each row's ``Count`` is drawn uniformly from ``Min_Count`` to
    ``Max_Count`` (inclusive). Draws are repeated until the counts sum to a
    deck of between 40 and 60 cards (inclusive).

    Parameters
    ----------
    main_suggested_cards : pandas.DataFrame
        Must contain integer ``Min_Count`` and ``Max_Count`` columns.

    Raises
    ------
    ValueError
        If no deck of 40-60 cards is reachable, i.e. the ``Max_Count`` values
        sum to fewer than 40 or the ``Min_Count`` values to more than 60. The
        previous version looped forever in that situation.
    """
    min_total = main_suggested_cards['Min_Count'].sum()
    max_total = main_suggested_cards['Max_Count'].sum()
    if max_total < 40 or min_total > 60:
        raise ValueError(
            'Cannot build a 40-60 card deck: suggested cards allow between '
            + str(min_total) + ' and ' + str(max_total) + ' cards.'
        )

    # Rejection sampling: redraw every count until the deck size is legal.
    while True:
        new_deck = main_suggested_cards.copy()
        new_deck['Count'] = [
            np.random.randint(low, high = high + 1)
            for low, high in zip(new_deck['Min_Count'], new_deck['Max_Count'])
        ]
        if 40 <= new_deck['Count'].sum() <= 60:
            break

    return new_deck



def reformatDeckDataFrame(deck_frame):
    """Expand a deck frame with ``Name`` and ``Count`` columns to one row per card copy.

    Parameters
    ----------
    deck_frame : pandas.DataFrame
        One row per distinct card; ``Count`` is converted with ``int``.

    Returns
    -------
    pandas.DataFrame
        A single ``Name`` column in which each name is repeated ``Count``
        times, in the row order of ``deck_frame``, with a fresh 0..n-1 index.
        An empty input gives an empty frame.
    """
    # Build the full list once instead of appending frame by frame:
    # DataFrame.append no longer exists in pandas 2.x.
    expanded_names = [
        name
        for name, count in zip(deck_frame["Name"], deck_frame["Count"])
        for _ in range(int(count))
    ]
    return pd.DataFrame({"Name": expanded_names})



def drawHand(deck_frame):
    """Shuffle the expanded deck and return its top five cards as a one-row count frame.

    The result has one column per distinct card in the hand, holding how many
    copies were drawn.
    """
    # Expand to one row per card copy and shuffle.
    shuffled_deck = reformatDeckDataFrame(deck_frame).sample(frac=1)

    # The top five cards form the opening hand.
    drawn_names = shuffled_deck.head(5)['Name'].to_list()

    # One single-row frame per drawn card, then sum the copies per name.
    single_card_frames = [
        pd.DataFrame({'CardName': name, 'CardCount': 1}, index = [0])
        for name in drawn_names
    ]
    counts_per_card = pd.concat(single_card_frames).groupby('CardName').agg(
        CardCount = pd.NamedAgg(column = 'CardCount', aggfunc = 'sum')
    )

    # Turn the per-card rows into one row with a column per card.
    wide_hand = counts_per_card.pivot_table(columns = 'CardName', values='CardCount').fillna(0).reset_index()
    return wide_hand.drop(columns='index')


def test_hand(hand_frame, model, X_train_cols_list):
    """Takes a hand data frame as an input and predicts a probability of winning using an imported classifier."""
    #First need to add extra columns for cards used in the model, but not in the hand
    new_df = pd.DataFrame(columns = X_train_cols_list)
    hand_frame['Other'] = 0 
    for col in hand_frame:
        if col not in X_train_cols_list:
            hand_frame['Other'] = hand_frame.apply(lambda x: x['Other'] + x[col], axis = 1)
            hand_frame = hand_frame.drop(columns = col)
    #print(hand_frame)
    new_df = pd.concat([new_df, hand_frame])
    new_df = new_df.fillna(0)
    #print(new_df)

    return np.float(model.predict_proba(new_df)[:,1])

def test_deck_once(deck_frame, model, X_train_cols_list):
    """Draw one random hand from ``deck_frame`` and score it with ``test_hand``."""
    sampled_hand = drawHand(deck_frame)
    return test_hand(sampled_hand, model, X_train_cols_list)

def test_deck(deck_frame, N, model, X_train_cols_list):
    """Score ``deck_frame`` over ``N`` independently drawn hands and return the mean score.

    The hands are scored with ``test_deck_once`` on 8 joblib workers.
    """
    score_one_hand = delayed(test_deck_once)
    pending_runs = (score_one_hand(deck_frame, model, X_train_cols_list) for _ in range(N))
    scores = Parallel(n_jobs = 8)(pending_runs)

    return np.mean(scores)


def test_n_hands(deck_frame, N, model, X_train_cols_list):
    """Draw ``N`` hands from ``deck_frame`` and return their mean predicted win probability.

    The hands are drawn with ``drawHand`` on 9 joblib workers and stacked into
    one frame. Counts of cards outside ``X_train_cols_list`` are summed into
    ``Other``. The features are passed to ``model.predict_proba`` in the order
    of ``X_train_cols_list``, with cards missing from a hand counted as 0.

    Returns
    -------
    float
        Mean of the second ``predict_proba`` column over all hands.
    """
    hand_list = Parallel(n_jobs = 9)(delayed(drawHand)(deck_frame) for i in range(0,N))

    # ignore_index gives every hand its own row label (each one arrives as row 0).
    hands_df = pd.concat(hand_list, ignore_index=True)

    other_cols = [col for col in hands_df.columns if col not in X_train_cols_list]
    hands_df['Other'] = hands_df[other_cols].sum(axis=1)
    hands_df = hands_df.drop(other_cols, axis = 1)

    new_df = hands_df.reindex(columns=X_train_cols_list).fillna(0)

    # np.float was removed from NumPy; the built-in float is equivalent here.
    return float(np.mean(model.predict_proba(new_df)[:,1]))

    

def breed_decks(deck_a, deck_b, mutation_rate = 0):
    """Combine two parent decks row by row into a child deck of 40-60 cards.

    The child starts as a copy of ``deck_a``. On every pass, each row is
    replaced by the matching ``deck_b`` row with probability 0.5. Then, with
    probability ``mutation_rate``, its ``Count`` is redrawn between the row's
    ``Min_Count`` and ``Max_Count`` (inclusive). Passes repeat on the same
    child until its counts total 40 to 60. The original index of ``deck_a`` is
    restored on the result.
    """
    parent_a = deck_a.reset_index()
    parent_b = deck_b.reset_index()
    offspring = parent_a.copy()

    if len(parent_a) != len(parent_b):
        print('Incompatible decks selected to be bred, please abort...')

    deck_is_legal = False
    while not deck_is_legal:
        for row in range(len(offspring)):
            inherit_from_b = np.random.uniform() >= 0.5
            if inherit_from_b:
                offspring.loc[row, :] = parent_b.loc[row, :]

            mutate = np.random.uniform() < mutation_rate
            if mutate:
                lowest = offspring.loc[row, 'Min_Count']
                highest = offspring.loc[row, 'Max_Count']
                offspring.loc[row, 'Count'] = np.random.randint(lowest, high = highest + 1)

        deck_size = offspring['Count'].sum()
        deck_is_legal = (deck_size >= 40) and (deck_size <= 60)

    return offspring.set_index('index')
        

def optimise_deck(deck_suggestion_path, model_path, database_path, num_init, num_hands, num_generations, num_children_per_pair = 20, mutation_rate = 0):
    """Generate optimised deck lists with a simple genetic search.

    Steps:
    1. create num_init randomly generated decks and score each one across num_hands hands
    2. pick the top 10 and use these as parents
    3. breed every pair of parents, each pair producing num_children_per_pair children,
       with mutations applied according to mutation_rate
    4. repeat the breeding for num_generations, keeping the top 10 children each time
    5. output the final top 10

    Parameters
    ----------
    deck_suggestion_path : CSV with 'Names' and 'Engine_Requirement_Flag' columns.
    model_path : pickled classifier; the training columns are read from the
        sibling file obtained by replacing '.sav' with 'X_train.csv'.
    database_path : CSV with 'Name', 'Ban_Status' and 'Extra_Deck_Status' columns.
    num_init : number of random decks in the initial population.
    num_hands : number of hands drawn to score each deck.
    num_generations : number of breeding rounds (0 returns the best initial decks).
    num_children_per_pair : children bred from each pair of parents.
    mutation_rate : forwarded to breed_decks.

    Returns
    -------
    (DataFrame, list)
        The top-10 id/score table of the last scored population and the
        matching deck frames. The id column is 'Child_ID' after at least one
        generation and 'Deck_ID' when num_generations is 0.
    """
    top_n = 10

    X_train_path = model_path[:model_path.find('.sav')]+'X_train.csv'

    #0. prepare initial card suggestions and model

    suggested_cards_frame = pd.read_csv(deck_suggestion_path)[['Names', 'Engine_Requirement_Flag']]
    db_cards = pd.read_csv(database_path)[['Name', 'Ban_Status', 'Extra_Deck_Status']] # need to confirm meta data of this to get correct card legality
    X_train = pd.read_csv(X_train_path)
    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # model files you created yourself. The context manager closes the handle.
    with open(model_path, 'rb') as model_file:
        clf = pickle.load(model_file)

    X_train_cols = X_train.columns.to_list()

    # Merge to get card status
    suggested_cards = pd.merge(left = suggested_cards_frame, right = db_cards, left_on = 'Names', right_on = 'Name', how = 'inner')

    if len(suggested_cards) < len(suggested_cards_frame):
        print('Error: Could not find some cards on the ygo database download... \nPlease abort and re-download the card database.')

    # Copies allowed per ban status; any other status allows 0 copies.
    copies_by_status = {'Unlimited': 3, 'Semi-limited': 2, 'Limited': 1}
    suggested_cards['Max_Count'] = suggested_cards['Ban_Status'].apply(lambda status: copies_by_status.get(status, 0))
    suggested_cards['Min_Count'] = suggested_cards['Engine_Requirement_Flag'].apply(lambda x: 1 * x)
    main_suggested_cards = suggested_cards[suggested_cards['Extra_Deck_Status']=='Main Deck']

    #1.
    decks = [generate_random_deck(main_suggested_cards) for _ in range(num_init)]

    print("Generated "+ str(num_init)+" initial decks")

    scores = [test_n_hands(deck, num_hands, clf, X_train_cols) for deck in tqdm(decks)]

    deck_id_df = pd.DataFrame({
        "Deck_ID": list(range(num_init)),
        "Score": scores
    }).sort_values(by = ['Score'], ascending = False)

    print("Tested "+ str(num_init)+" initial decks")

    #2.
    # top_scores always describes the population `parents` was picked from, so
    # the return value is defined even when no generation is run.
    top_scores = deck_id_df.head(top_n)
    parents = [decks[i] for i in top_scores['Deck_ID'].to_list()]

    #4.
    for g in range(num_generations):
        print("Beginning generation "+str(g+1))

        #3.
        children = []
        children_scores = []
        for i in tqdm(range(len(parents))):
            for j in range(i+1, len(parents)):
                parent_a = parents[i]
                parent_b = parents[j]

                for _ in range(num_children_per_pair):
                    child = breed_decks(parent_a, parent_b, mutation_rate)
                    children.append(child)
                    children_scores.append(test_n_hands(child, num_hands, clf, X_train_cols))

        # A child's id is its position in `children`.
        child_id_df = pd.DataFrame({
            "Child_ID": list(range(len(children))),
            "Score": children_scores
        }).sort_values(by = ['Score'], ascending = False)

        top_scores = child_id_df.head(top_n)
        top_decks = top_scores['Child_ID'].to_list()
        parents = [children[i] for i in top_decks]
        print("Strongest deck IDs:\n")
        print([str(element) for element in top_decks])

    #5.
    #provide scores and parents

    return top_scores, parents








In [None]:
#1 #############################################################################################################################
# Step 1 of the pipeline (see the outline at the top of the notebook).
# save_ygo_database and database_save_path are defined elsewhere in this notebook.

save_ygo_database(database_save_path)


In [None]:
#2 #############################################################################################################################
# Card entry window: search the card database, build a deck list, flag engine
# requirements and save the deck. The callbacks wired up below (add_to_deck,
# eng_req_card, save_deck, update, refresh_deck, refresh_eng_reqs, fillout,
# check, remove, remove_req) are defined elsewhere in this notebook.
# NOTE(review): this cell reads database_save_path with pd.read_pickle, while
# optimise_deck is later given the same path and reads it with pd.read_csv.
# Confirm which format save_ygo_database writes.
df = pd.read_pickle(database_save_path)


Names = df['Name'].to_list()


root = Tk()
root.title('Deck Builder - Card Entry Window')
root.geometry('750x500')


# Column 0: search box and matching card names.
# The font family was misspelled 'Helevetica', which Tk cannot match to a real font.
start_label = Label(root, text='Start Typing...',
                font = ('Helvetica', 14), fg = 'grey')
start_label.grid(row = 0, column = 0, padx = 10, pady = 10)

search_entry = Entry(root, font=('Helvetica', 20))
search_entry.grid(row=1, column = 0, padx = 10, pady = 10)


search_list = Listbox(root, width = 40)
search_list.grid(row = 2, column = 0, padx = 10, pady = 10)

add_button = Button(root, text='Add to Deck', command = add_to_deck)
add_button.grid(row = 3, column = 0, padx = 10, pady = 10)

# Column 1: the deck being built and its name.
deck_label = Label(root, text='Current Deck',
                font = ('Helvetica', 14), fg = 'grey')
deck_label.grid(row=0, column = 1, padx = 10, pady = 10)

deck_list = Listbox(root, width = 40)
deck_list.grid(row = 2, column = 1, padx = 10, pady = 10)


deck_name_entry = Entry(root, font=('Helvetica', 20))
deck_name_entry.grid(row = 1, column = 1, padx=10, pady=10)


# Column 2: cards flagged as engine requirements.
req_list = Listbox(root, width = 40)
req_list.grid(row = 2, column = 2, padx=10, pady=10)

engine_requirement = Button(root, text = 'Make Engine Requirement', command = eng_req_card)
engine_requirement.grid(row = 3, column = 2, padx = 10, pady = 10)

save_button = Button(root, text = 'Save Deck', command = save_deck)
save_button.grid(row = 3, column = 1, padx = 10, pady = 10)

end_label = Label(root, text = 'Please Include Extra Deck Monsters.')
end_label.grid(row = 4, column = 0, padx = 10, pady = 10)


# Module-level state for the current deck and its engine requirements.
Deck_Names = []

Engine_Names = []

update(Names)

refresh_deck(Deck_Names)

refresh_eng_reqs(Engine_Names)

status = 'Ready'


#Create binding on list box onclick
search_list.bind("<<ListboxSelect>>", fillout)

#Create binding on entry box
search_entry.bind("<KeyRelease>", check)

#Create binding on current deck list for deleting cards
deck_list.bind('<Double-Button-1>', remove)


#Create binding on current engine requirements list for deleting cards
req_list.bind('<Double-Button-1>', remove_req)


# Blocks until the window is closed.
root.mainloop()


### This is the part which doesn't work at the moment
### duelingbook (db) updated the way access is authorised... working on a solution when I have time

In [None]:


num_cores = multiprocessing.cpu_count()


#3 #############################################################################################################################
# Optionally capture new replays in batches of 1000 ids. The next id to parse is
# kept in the log file at log_path; capture_replay and log_path are defined
# elsewhere in this notebook.

update_replay_store = input("Extract replays from duelingbook.com? [Y/N]")
while update_replay_store not in ['Y', 'N']:
    update_replay_store = input("Please only input capitalised 'Y' or 'N'...")

if update_replay_store == 'Y':
    update_replay_store_count = input("How many new 1000s of generic replays should be captured? \n(.e.g if you input 1, 1000 new replays will be parsed)")
    # int() raises ValueError on non-numeric text, so the old
    # isinstance(int(...), int) check crashed instead of asking again.
    while True:
        try:
            Replay_Parse_Count = int(update_replay_store_count)
            break
        except ValueError:
            update_replay_store_count = input("Please only give an integer or 0 number of 1000s to parse...")

    for counter in range(1, Replay_Parse_Count + 1):
        print(f'Parsing set {counter} of {Replay_Parse_Count}')

        if os.path.isfile(log_path):
            # The log holds a single line ending in 'N = <next id>'.
            with open(log_path) as log_file:
                log_text = log_file.read()
            N = int(log_text[log_text.find('N = ')+4:])
        else:
            N = 32000000
            # Report the id actually used; the old message said 46000000.
            print(f"No log found, will parse replays beginning from ID = {N}")
        Parallel(n_jobs=10)(delayed(capture_replay)(i) for i in tqdm(range(N,N+1000)))

        # Record where this batch stopped so the next batch resumes from there.
        with open(log_path, 'w') as log_file:
            log_file.write('Parsing ended at N = ' + str(N+1000))


In [None]:
from tqdm import tqdm
import multiprocessing
from joblib import Parallel, delayed
#4 #############################################################################################################################
# Collect the path of every regular file directly inside the compressed replay
# folder (sub-directories are skipped), as '<folder>/<name>'.
file_names = [
    compressed_replay_folder_path + '/' + entry
    for entry in listdir(compressed_replay_folder_path)
    if isfile(join(compressed_replay_folder_path, entry))
]

print(str(len(file_names)) + ' files found...')
print('____________________________\n\n_____________________________')


In [None]:
# This bit of code can break weaker computers - run a small number of jobs and only look at a small number of replays unless your computer can handle it
# This is so suboptimal that it nearly used up 32GB of RAM one time
# job_count is the single knob for parallelism; it was previously declared but
# ignored in favour of a hard-coded n_jobs=4.
job_count = 4
hands_list = Parallel(n_jobs=job_count)(delayed(handify_replay)(replay_path) for replay_path in tqdm(file_names))

# One combined frame for all replays, written as Hands.csv in hand_save_folder.
hands_df = pd.concat(hands_list).reset_index()
hands_df.to_csv(hand_save_folder + '/Hands.csv', index=False)


In [None]:

#5 #############################################################################################################################
# Step 5: ask the user for a deck file, then call train_hand_classifier.
# train_hand_classifier, hand_file_path, database_save_path and
# model_folder_path are defined elsewhere in this notebook.
from tkinter import filedialog as fd
print('Select a deck')
# Opens a file-picker dialog; the chosen path is passed on unchanged.
# NOTE(review): no check is made that a file was actually chosen — confirm
# how train_hand_classifier copes if the dialog is cancelled.
filename = fd.askopenfilename()

train_hand_classifier(hand_file_path, filename, database_save_path, model_folder_path)



In [None]:
from tkinter import filedialog as fd
from tqdm import tqdm
import multiprocessing
from joblib import Parallel, delayed


#6 #############################################################################################################################
# Step 6: pick a suggested deck file, locate the model saved under the same
# deck name, and run the genetic search.

print("Now, we will optimise a deck list based on the captured data and model.")

print("\n\nSelect your deck.")
suggested_deck_path = fd.askopenfilename()

# Deck name = file name after the last '/', minus its final four characters.
# NOTE(review): this assumes a three-letter extension such as '.csv' and must
# match the name used when the model was saved — confirm against train_hand_classifier.
deck_name = suggested_deck_path[suggested_deck_path.rfind('/')+1:-4]
model_path = model_folder_path + 'RandomForestClassifier_'+deck_name+'.sav'


# Search settings: initial population size, hands per score, breeding rounds,
# children per parent pair, and per-row mutation probability.
num_init = 1000
num_hands = 10000
num_generations = 8
num_children = 4
mutation_rate = 0.005

id_frame, results = optimise_deck(suggested_deck_path, model_path, database_save_path, num_init, num_hands, num_generations, num_children_per_pair = num_children, mutation_rate = mutation_rate)


#7 ##############################################################################
# id_frame is sorted by score, best first, so row 0 is the best deck. Take the
# scalar: str() of a one-row Series would also print its index, name and dtype.
best_score = id_frame['Score'].iloc[0]

print('Your optimised deck lists have been found!')
print('\n\nThe highest scoring deck received an estimated win rate, regardless of skill or opponent, of '+str(best_score))
print('\n\nYour optimised deck list looks like this: ')
results[0]

In [None]:
# Write each optimised deck and the score table next to the model file.
# The loop variable is deliberately not called `df`, which step #2 uses for the
# card database; the previous loop rebound that notebook-global name.
results_prefix = model_folder_path + 'RandomForestClassifier_'+deck_name+'_Results_'

for rank, deck_result in enumerate(results):
    deck_result.to_csv(results_prefix + str(rank) + '.csv')

id_frame.to_csv(results_prefix + 'id_frame.csv')