# Second Data Clean
## Oct. 29, 2021

In [9]:
import pandas as pd
import csv
import os
from glob import glob
import re

Combining the files cleaned in the Oct. 28 file into a single long file, one for pitchers and one for hitters.

In [2]:
# Point data_path at the 'Data/Clean' folder under the parent of the
# current working directory

cwd = os.getcwd()
data_path = os.path.dirname(cwd) + '/Data/Clean'

# Walk every directory below data_path and gather the path of each CSV
# file found along the way

all_csv_files = []
for _folder, _subdirs, _names in os.walk(data_path):
    all_csv_files.extend(glob(os.path.join(_folder, '*.csv')))

In [22]:
lines = []

# Loops over each file and adds a column for year, level, and team
# (written as 'Year', 'Level', and 'Org'; the values are parsed from the file path).
# Each file is saved to its own path with 'Raw' replaced by 'Clean'.
# NOTE(review): if the paths in all_csv_files contain no 'Raw', the replace is a
# no-op and the file is overwritten in place -- running this cell a second time
# would then append the three columns again. Confirm before re-running.

for path in all_csv_files:

    # Getting the year, level, and organization from the file path:
    #   year  = path segment right after 'pitch/' (or after 'bat/' when
    #           'pitch' does not appear in the path)
    #   level = path segment right after '<year>/'
    #   org   = text between '<level>_' and the next '_'
    # year and level are passed through re.escape before being reused as
    # patterns so that regex metacharacters in a folder name (e.g. a '+')
    # are matched literally.
    # A path that does not fit this shape raises AttributeError, because
    # re.search returns None.

    if 'pitch' in path:
        year = re.search(r'pitch/(.+?)/', path).group(1)
    else:
        year = re.search(r'bat/(.+?)/', path).group(1)
    level = re.search(f'{re.escape(year)}/(.+?)/', path).group(1)
    org = re.search(f'{re.escape(level)}_(.+?)_', path).group(1)

    # newline='' lets the csv module do its own newline handling, as its
    # documentation requires for file objects passed to reader/writer
    with open(path, 'r', newline='') as readFile:
        reader = csv.reader(readFile)

        for idx, row in enumerate(reader):

            if idx == 0: # adds the column header if it's the first row
                row_mod = row + ['Year', 'Level', 'Org']

            else: # otherwise adds the info we found above
                row_mod = row + [year, level, org]

            lines.append(row_mod)

    # The input file is closed by this point, so writing to the same path is safe.
    # Without newline='' the writer's '\r\n' row endings are translated again
    # on platforms that use '\r\n', leaving a blank line after every row.
    with open(path.replace('Raw', 'Clean'), 'w', newline='') as writeFile:

        writer = csv.writer(writeFile)
        writer.writerows(lines)

    lines = [] # resets the list so that we don't keep appending more lines

In [33]:
# Getting lists of file paths for hitters and pitchers.
# The combined exports written later in this notebook (bat_all.csv and
# pitch_all.csv) land directly inside these two folders, so they are
# filtered out here. Otherwise re-running the notebook would read the
# previous combined file back in and duplicate every row.

bat_path = data_path + '/bat'
pitch_path = data_path + '/pitch'

# Normalised so the comparison is not thrown off by mixed path separators
bat_all_csv = os.path.normpath(bat_path + '/bat_all.csv')
pitch_all_csv = os.path.normpath(pitch_path + '/pitch_all.csv')

bat_files = [file
             for path, subdir, files in os.walk(bat_path)
             for file in glob(os.path.join(path, '*.csv'))
             if os.path.normpath(file) != bat_all_csv]

pitch_files = [file
               for path, subdir, files in os.walk(pitch_path)
               for file in glob(os.path.join(path, '*.csv'))
               if os.path.normpath(file) != pitch_all_csv]

In [47]:
# Stack every batter CSV into one DataFrame with a fresh index,
# then drop the 'Notes' column (apparently it wasn't fully removed in the earlier cleaning)

bat_all = (
    pd.concat([pd.read_csv(csv_path) for csv_path in bat_files], ignore_index=True)
    .drop(columns=['Notes'])
)

In [45]:
# Stack every pitcher CSV into one DataFrame with a fresh index

pitch_all = pd.concat(
    [pd.read_csv(csv_path) for csv_path in pitch_files],
    ignore_index=True,
)

In [51]:
# Export both combined frames as CSV, leaving out the DataFrame index column

bat_all.to_csv(path_or_buf=f'{bat_path}/bat_all.csv', index=False)
pitch_all.to_csv(path_or_buf=f'{pitch_path}/pitch_all.csv', index=False)