# First Data Clean
## Oct. 28, 2021

In [63]:
import pandas as pd
import csv
import os
from glob import glob

Removing certain rows and columns from the data in order to put it in a more workable format. This does the following:

* removes summary rows, for both minors and majors
* removes "Notes" column, in minors
* removes the Position, OPS+, and ERA+/FIP columns, in majors (to get the same dimensionality as minors)

In [40]:
# Points at the "Data" folder that sits next to the current working directory

cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
data_path = parent_dir + '/Data'

# Walks the whole data folder and collects the path of every .csv file found,
# one directory at a time

all_csv_files = []

for folder, _subdirs, _files in os.walk(data_path):
    csv_pattern = os.path.join(folder, '*.csv')
    all_csv_files.extend(glob(csv_pattern))

In [41]:
# Splits the file paths into major and minor league groups
# They are handled separately because the two data layouts differ
# A path counts as major league when it contains 'MLB'; everything else is minors

majors = []
minors = []

for csv_path in all_csv_files:
    if 'MLB' in csv_path:
        majors.append(csv_path)
    else:
        minors.append(csv_path)

In [55]:
lines = []

# Indices to exclude, corresponding to the "Notes" column

idx_bat = 25
idx_pitch = 30

# Loops over each file, drops the "Notes" column and skips the summary "Players" row
# Saves to "Clean" folder (same path with 'Raw' replaced by 'Clean')

for path in minors:

    idx = idx_pitch if 'pitch' in path else idx_bat # getting correct index

    lines = [] # fresh list per file so rows never carry over between files

    # newline='' is what the csv module expects for its file objects

    with open(path, 'r', newline='') as readFile:
        reader = csv.reader(readFile)

        for row in reader:

            row_mod = row[:idx] + row[(idx+1):]

            # Filter before appending: a summary row is skipped exactly once,
            # no matter how many of its fields mention 'Players'

            if any('Players' in field for field in row_mod):
                continue

            lines.append(row_mod)

    with open(path.replace('Raw', 'Clean'), 'w', newline='') as writeFile:

        writer = csv.writer(writeFile)
        writer.writerows(lines)

lines = [] # leaves the list empty for whatever runs next

In [66]:
lines = []

# Indices to exclude, corresponding to the OPS+ and ERA+/FIP columns

idx_bat = 20
idx_pitch = [25, 26]

# Strings that we'd exclude if they were player positions

str_excl = ['', '1', 'P']

# Loops over each file, drops the position column (first column) plus the
# OPS+ / ERA+ / FIP columns, and skips the rows we don't want
# Saves to "Clean" folder (same path with 'Raw' replaced by 'Clean')

for path in majors:

    idx = idx_pitch if 'pitch' in path else idx_bat # getting correct indices
    is_batting = idx == idx_bat

    lines = [] # fresh list per file so rows never carry over between files

    # newline='' is what the csv module expects for its file objects

    with open(path, 'r', newline='') as readFile:
        reader = csv.reader(readFile)

        for row in reader:

            if len(row) < 2: # blank or malformed line, nothing to keep
                continue

            pos = row[0] # grabbing to check if it's data we want

            if is_batting:

                if pos in str_excl: # a pitcher hitting, or a summary row
                    continue

                row_mod = row[1:idx] + row[(idx+1):]

            else: # if we're looking at pitcher data

                name = row[1] # because the summary rows are formatted differently

                if pos == str_excl[1] or 'Team Totals' in name or 'Rank in' in name:
                    continue

                row_mod = row[1:idx[0]] + row[(idx[1]+1):]

            # Only reached for rows that passed the filters above, so an
            # excluded row can no longer re-append the previous row (or leak
            # the last row of the previous file); no de-duplication is needed

            lines.append(row_mod)

    with open(path.replace('Raw', 'Clean'), 'w', newline='') as writeFile:

        writer = csv.writer(writeFile)
        writer.writerows(lines)

lines = [] # leaves the list empty for whatever runs next