Skip to content

Commit

Permalink
Added new tests to the testing framework.
Browse files Browse the repository at this point in the history
Added the ability to remove null columns from the export via the collapse parameter.
Added the ability to generate a csv of just the top level games information and exclude the moves file.
Added the ability to configure the size of the blocking queue that is used to generate the moves file concurrently while each game in the pgn file is processed.
Minor code enhancements throughout
  • Loading branch information
zq99 committed Jul 5, 2023
1 parent 2526e04 commit b68070c
Show file tree
Hide file tree
Showing 46 changed files with 84,276 additions and 47,826 deletions.
2 changes: 1 addition & 1 deletion .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion .idea/pgn2data.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 16 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,13 @@ The result object also provides methods to import the created files into pandas
combined_df = result.get_combined_df()
print(combined_df.head())

To output the game information only, you can do the following:

from converter.pgn_data import PGNData

pgn_data = PGNData("tal_bronstein_1982.pgn")
pgn_data.export(moves_required=False)


## Examples

Expand Down Expand Up @@ -138,30 +145,33 @@ This is a full list of the columns in each output file:
| is_check | Is check on board |
| is_check_mate | Is checkmate on board |
| is_fifty_moves | Is 50 move complete |
| is_fivefold_repetition | Is 5 fold reptition on board |
| is_fivefold_repetition | Is 5 fold repetition on board |
| is_game_over | Is game over |
| is_insufficient_material | Is game over from lack of mating material |
| white_count | Count of white pieces |
| black_count | Count of black pieces |
| white_{piece}_count | Count of white specifed piece |
| black_{piece}_count | Count of black specifed piece |
| white_{piece}_count | Count of white specified piece |
| black_{piece}_count | Count of black specified piece |
| captured_score_for_white | Total of black pieces captured |
| captured_score_for_black | Total of white pieces captured |
| fen_row{number}_{colour}_count | Number of pieces for the specified colour on this row of the board |
| fen_row{number}_{colour}_value | Total value of pieces for the specified colour on this row of the board |
| move_sequence | Sequence of moves upto current position |
| move_sequence | Sequence of moves up to current position |


## Contributions

Contributions are welcome; all modifications should come with appropriate tests demonstrating
an issue has been resolved, or new functionality is working as intended.
an issue has been resolved, or new functionality is working as intended. Pull Requests without tests
will not be merged.

All tests can be run by doing the following:
The library can be tested by doing the following:

from testing.tests import run_all_tests
run_all_tests()

New tests should be added to the above method.


## Acknowledgements

Expand Down
106 changes: 77 additions & 29 deletions converter/pgn_data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import ntpath
import os.path
import pandas as pd

from common.common import open_file
from common.log_time import TimeProcess
Expand All @@ -10,14 +11,18 @@
log = logging.getLogger("pgn2data - pgn_data class")
logging.basicConfig(level=logging.INFO)

# Default for export(moves_required=...): when true a games and a moves file are created.
DEFAULT_MOVES_REQUIRED = True
# Default max size of the blocking queue used when processing moves;
# it is handed to queue.Queue(maxsize=...), where 0 means the queue is unbounded.
DEFAULT_QUEUE_SIZE = 0
# Default for export(collapse=...): when true null columns are removed from the final files.
DEFAULT_COLLAPSE = False


class PGNData:
"""
Main class to handle the library's methods
examples of how to call:
(1) p = PGNData("tal_bronstein_1982.pgn","test")
(2) p = PGNData("tal_bronstein_1982.pgn")
(3) p = PGNData(["tal_bronstein_1982.pgn","tal_bronstein_1982.pgn"],"myfilename")
(3) p = PGNData(["tal_bronstein_1982.pgn","tal_bronstein_1982.pgn"],"MyFilename")
(4) p = PGNData(["tal_bronstein_1982.pgn","tal_bronstein_1982.pgn"])
p.export()
Expand All @@ -38,33 +43,43 @@ def set_engine_depth(self, depth):
else:
log.error("Invalid engine depth specified: " + str(depth))

def export(self):
def export(self, moves_required: bool = DEFAULT_MOVES_REQUIRED, queue_size: int = DEFAULT_QUEUE_SIZE, collapse: bool = DEFAULT_COLLAPSE):
"""
main method to convert pgn to csv
:parameter moves_required - if true a games and moves file is created
:parameter queue_size - this is the max_size of the blocking queue when processing moves
:parameter collapse - this removes any null columns from the final files
"""

if not isinstance(moves_required, bool):
raise TypeError("moves_required must be a bool")
if not isinstance(queue_size, int) or queue_size < 0:
raise ValueError("queue_size must be an int greater or equal to 0")
if not isinstance(collapse, bool):
raise TypeError("collapse must be a bool, when True it will remove null columns")

timer = TimeProcess()
result = Result.get_empty_result()
if isinstance(self._pgn, list):
if not self.__is_valid_pgn_list(self._pgn):
log.error("no pgn files found!")
return result
file = self.__create_file_name(self._pgn[0]) if self._file_name is None else self._file_name
result = self.__process_pgn_list(self._pgn, file)
elif isinstance(self._pgn, str):
if not os.path.isfile(self._pgn):
log.error("no pgn files found!")
return result
pgn_list = [self._pgn]
file = self.__create_file_name(self._pgn) if self._file_name is None else self._file_name
result = self.__process_pgn_list(pgn_list, file)

pgn_list = self._pgn if isinstance(self._pgn, list) else [str(self._pgn)]
file_name = self._pgn[0] if isinstance(self._pgn, list) and len(self._pgn) > 0 else str(self._pgn)

if not self.__is_valid_pgn_list(pgn_list):
log.error("No valid pgn file(s) found to convert to csv!")
return result

full_file_name = self.__create_file_name(file_name) if self._file_name is None else self._file_name
result = self.__process_pgn_list(pgn_list, full_file_name, moves_required, queue_size, collapse)

timer.print_time_taken()
return result

@staticmethod
def __create_file_name(file_path):
return ntpath.basename(file_path).replace(".pgn", "")

def __process_pgn_list(self, file_list, output_file=None):
def __process_pgn_list(self, file_list, output_file=None, moves_required=DEFAULT_MOVES_REQUIRED,
queue_size=DEFAULT_QUEUE_SIZE, collapse=DEFAULT_COLLAPSE):
"""
This takes a PGN file and creates two output files
1. First file contains the game information
Expand All @@ -76,30 +91,56 @@ def __process_pgn_list(self, file_list, output_file=None):
result = Result.get_empty_result()

file_name_games = output_file + '_game_info.csv'
file_name_moves = output_file + '_moves.csv'

file_games = open_file(file_name_games)
file_moves = open_file(file_name_moves)

if file_games is None or file_moves is None:
log.info("No data exported!")
if moves_required:
file_name_moves = output_file + '_moves.csv'
file_moves = open_file(file_name_moves)
export_files_initialized = (file_games is not None) and (file_moves is not None)
else:
file_name_moves, file_moves = None, None
export_files_initialized = file_games is not None

if not export_files_initialized:
log.info("Could not initialize the csv files to export the data into!")
return result

add_headers = True
for file in file_list:
process = Process(file, file_games, file_moves, self._engine_path, self._depth)
process = Process(file, file_games, file_moves, self._engine_path, self._depth, moves_required, queue_size)
process.parse_file(add_headers)
add_headers = False

file_games.close()
file_moves.close()
if moves_required:
file_moves.close()

# remove any null columns
if collapse:
self.__remove_empty_columns(file_name_games)
self.__remove_empty_columns(file_name_moves)

# return a result object to indicate outcome
result = self.__get_result_of_output_files(file_name_games, file_name_moves)
result = self.__get_result_of_output_files(file_name_games, file_name_moves, moves_required)

log.info("ending process..")
return result

@staticmethod
def __remove_empty_columns(file_name):
# Load the CSV file
if isinstance(file_name, str):
if os.path.isfile(file_name):
df = pd.read_csv(file_name)

# Remove columns where all values are NaN
df = df.dropna(axis=1, how='all')

# Overwrite the original CSV file
df.to_csv(file_name, index=False)

del df

@staticmethod
def __is_valid_pgn_list(file_list):
"""
Expand All @@ -113,16 +154,23 @@ def __is_valid_pgn_list(file_list):
return True
return False

def __get_result_of_output_files(self, game_file_name, moves_file_name):
def __get_result_of_output_files(self, game_file_name, moves_file_name=None, moves_required=DEFAULT_MOVES_REQUIRED) -> Result:
result = Result.get_empty_result()

try:
is_games_file_exists = os.path.isfile(game_file_name)
is_moves_file_exists = os.path.isfile(moves_file_name)
is_files_exists = is_games_file_exists and is_moves_file_exists
game_size = self.__get_size(game_file_name) if is_games_file_exists else 0
move_size = self.__get_size(moves_file_name) if is_moves_file_exists else 0
game_result = ResultFile(game_file_name, game_size)
move_result = ResultFile(moves_file_name, move_size)

if moves_required:
is_moves_file_exists = os.path.isfile(moves_file_name) if moves_file_name is not None else False
move_size = self.__get_size(moves_file_name) if is_moves_file_exists else 0
move_result = ResultFile(moves_file_name, move_size)
is_files_exists = is_games_file_exists and is_moves_file_exists
else:
is_files_exists = is_games_file_exists
move_result = None

result = Result(is_files_exists, game_result, move_result)
except Exception as e:
log.error(e)
Expand Down
45 changes: 40 additions & 5 deletions converter/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

class PlayerMove:
"""
data class to hold details of each move
PlayerMove = Data class to hold details of each move.
move = Move object from python chess library
notation = is algebraic notation of the move
"""
Expand Down Expand Up @@ -50,27 +50,62 @@ class Process:
Handles the pgn to data conversion
"""

def __init__(self, pgn_file, file_games, file_moves, engine_path, engine_depth):
def __init__(self, pgn_file, file_games, file_moves, engine_path, engine_depth, moves_required, queue_size=0):
self.pgn_file = pgn_file
self.file_games = file_games
self.file_moves = file_moves
self.engine_path = engine_path
self.engine_depth = engine_depth
self.max_queue_size = queue_size
self.moves_required = moves_required

def parse_file(self, add_headers_flag=True):
    """
    Entry point of the class for invoking the file processing.

    Delegates to the games-and-moves parser when moves are required,
    otherwise to the games-only parser.
    :parameter add_headers_flag - passed through unchanged to the selected parser
    """

    if not self.moves_required:
        self.__parse_file_games(add_headers_flag)
        return

    self.__parse_file_games_and_moves(add_headers_flag)

def __parse_file_games(self, add_headers_flag=True):
    """
    processes the pgn file and then exports game information
    into the game csv file, moves are ignored

    :parameter add_headers_flag - if true the header row is written to the
    games file before any game rows
    """

    log.info("Processing games only in file:{}".format(self.pgn_file))

    # the context manager closes the pgn file when parsing finishes or raises;
    # previously the handle was left open
    with open(self.pgn_file, encoding="UTF-8") as pgn:
        game_writer = csv.writer(self.file_games, delimiter=',')
        if add_headers_flag:
            game_writer.writerow(file_headers_game)

        order = 1
        while True:
            game = chess.pgn.read_game(pgn)
            if game is None:
                break  # end of file

            # an id is only generated once a game has actually been read
            game_id = str(uuid.uuid4())
            game_writer.writerow(self.__get_game_row_data(game, game_id, order, self.pgn_file))
            order += 1

def __parse_file_games_and_moves(self, add_headers_flag=True):
"""
processes the pgn file and then exports game information
into the game csv file, and the moves into the moves csv file
"""

log.info("Processing file:{}".format(self.pgn_file))
log.info("Processing games and moves in file:{}".format(self.pgn_file))
pgn = open(self.pgn_file, encoding="UTF-8")

engine = None
if self.engine_path is not None:
engine = chess.engine.SimpleEngine.popen_uci(self.engine_path)

q = queue.Queue(maxsize=0)
q = queue.Queue(maxsize=self.max_queue_size)
worker = Thread(target=self.__process_move_queue, args=(q,))
worker.setDaemon(True)
worker.start()
Expand Down
17 changes: 11 additions & 6 deletions converter/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,25 +27,30 @@ def print_summary(self):
"""
print("is complete: {}".format(str(self.is_complete)))
print("games file: {} | size: {}".format(self.games_file.name, self.games_file.size))
print("moves file: {} | size: {}".format(self.moves_file.name, self.moves_file.size))
if self.moves_file is not None:
print("moves file: {} | size: {}".format(self.moves_file.name, self.moves_file.size))

def get_games_df(self):
    """
    Load the exported games file.

    :return whatever the class's dataframe loader produces for the games file name
    """
    games_file_name = self.games_file.name
    return self.__get_as_dataframe(games_file_name)

def get_moves_df(self):
return self.__get_as_dataframe(self.moves_file.name)
if self.moves_file is None:
return None
else:
return self.__get_as_dataframe(self.moves_file.name)

def get_combined_df(self):
games_df = self.get_games_df()
moves_df = self.get_moves_df()
if (games_df is not None) and (moves_df is not None):
if (not games_df.empty) and (not moves_df.empty):

if (games_df is not None) and (not games_df.empty):
if (moves_df is not None) and (not moves_df.empty):
combined_df = pd.merge(games_df, moves_df, on='game_id')
return combined_df
else:
log.error("one or both files is empty")
return games_df
else:
log.error("one or both files not found")
log.error("games information is missing or empty")
return None

def create_combined_file(self, filename):
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

setup(
name='pgn2data',
version='0.0.8',
version='0.0.9',
packages=['converter', 'common', 'testing'],
url='https://github.com/zq99/pgn2data',
python_requires=">=3.7",
Expand All @@ -23,7 +23,7 @@
author='zq99',
author_email='zq99@hotmail.com',
keywords=['CHESS', 'PGN', 'NOTATION', 'DATA', 'FORSYTH–EDWARDS NOTATION', 'CSV', 'DATASET', 'DATABASE',
'NORMALIZATION', 'TABULATION', 'STRUCTURED DATA', 'SQL', 'TABLE', 'EXCEL'],
'NORMALIZATION', 'TABULATION', 'STRUCTURED DATA', 'SQL', 'TABLE', 'EXCEL', 'PYTHON-CHESS'],
install_requires=[
'chess',
'pandas'
Expand Down
2 changes: 1 addition & 1 deletion testing/exports/basic_format_test_game_info.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
game_id,game_order,event,site,date_played,round,white,black,result,white_elo,white_rating_diff,black_elo,black_rating_diff,white_title,black_title,winner,winner_elo,loser,loser_elo,winner_loser_elo_diff,eco,termination,time_control,utc_date,utc_time,variant,ply_count,date_created,file_name
9514cb04-e7ba-4e5c-9078-1426010be4d4,1,F/S Return Match,"Belgrade, Serbia JUG",1992.11.04,29,"Fischer, Robert J.","Spassky, Boris V.",1/2-1/2,,,,,,,draw,,draw,,,,,,,,,,2022-10-28T21:34:12+0000,basic_format_test.pgn
632a6767-0a57-4159-ac5c-474f9fa4ffb0,1,F/S Return Match,"Belgrade, Serbia JUG",1992.11.04,29,"Fischer, Robert J.","Spassky, Boris V.",1/2-1/2,,,,,,,draw,,draw,,,,,,,,,,2023-07-05T12:07:45+0000,basic_format_test.pgn

0 comments on commit b68070c

Please sign in to comment.