In [1]:
#Import the packages needed
import csv
import os
import io
import pandas as pd
import pandas_gbq
import numpy as np
import zipfile

from zipfile import ZipFile
from google.cloud import bigquery
from google.oauth2 import service_account


In [None]:

# Zips whose CSVs are semicolon-delimited live in this folder
zip_folder = 'semicolon_files'

# Comma-delimited copies are written here, under the same zip names
output_folder = 'wedge_zips'

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Iterate over the zip files in the zip folder
for zip_file_name in os.listdir(zip_folder):

    # Anything that is not a zip archive is ignored
    if not zip_file_name.endswith('.zip'):
        continue

    zip_file_path = os.path.join(zip_folder, zip_file_name)
    output_zip_file_path = os.path.join(output_folder, zip_file_name)

    # Open the source first so an unreadable input does not leave an empty output zip behind
    with ZipFile(zip_file_path, 'r') as zf:
        with ZipFile(output_zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as output_zip:
            for file_name in zf.namelist():
                if not file_name.endswith('.csv'):
                    continue

                with zf.open(file_name, 'r') as raw_file:
                    # newline='' hands line-ending handling to the csv module
                    input_file = io.TextIOWrapper(raw_file, encoding="utf-8", newline='')

                    # Parse with the csv module rather than str.split(';') so that
                    # semicolons inside quoted fields survive and values containing
                    # commas get quoted on output.
                    converted = io.StringIO()
                    writer = csv.writer(converted, lineterminator='\n')
                    # Skip empty rows; each row is written with exactly one newline
                    # (the old '\n'.join doubled them because lines already ended in '\n').
                    writer.writerows(row for row in csv.reader(input_file, delimiter=';') if row)

                # Keep the member name the same inside the new zip
                output_zip.writestr(file_name, converted.getvalue())

print("CSV files have been processed and saved back to their respective zip files.")


In [None]:
# Target Python type for every CSV column except "datetime" (column 1), which
# is left out of this mapping. Insertion order follows the column order in the
# files.
# NOTE(review): astype(bool) turns missing values into True — confirm the four
# bool columns never contain NaN.
column_types = dict(
    # columns 2-4
    register_no=float,
    emp_no=float,
    trans_no=float,
    # columns 5-9
    upc=str,
    description=str,
    trans_type=str,
    trans_subtype=str,
    trans_status=str,
    # columns 10-33
    department=float,
    quantity=float,
    Scale=float,
    cost=float,
    unitPrice=float,
    total=float,
    regPrice=float,
    altPrice=float,
    tax=float,
    taxexempt=float,
    foodstamp=float,
    wicable=float,
    discount=float,
    memDiscount=float,
    discountable=float,
    discounttype=float,
    voided=float,
    percentDiscount=float,
    ItemQtty=float,
    volDiscType=float,
    volume=float,
    VolSpecial=float,
    mixMatch=float,
    matched=float,
    # columns 34-35
    memType=bool,
    staff=bool,
    # columns 36-40
    numflag=float,
    itemstatus=float,
    tenderstatus=float,
    charflag=str,
    varflag=float,
    # columns 41-44
    batchHeaderID=bool,
    local=float,
    organic=float,
    display=bool,
    # columns 45-50
    receipt=float,
    card_no=float,
    store=float,
    branch=float,
    match_id=float,
    trans_id=float,
)

In [None]:
# The folder where the zips whose CSVs have no header row live
# (discovered using the task1_testing file)
zip_folder = "no_headers"

# List all the zip files in the folder
zip_files = [file for file in os.listdir(zip_folder) if file.endswith(".zip")]

# Repaired zips are written next to the other cleaned zips
output_folder = "wedge_zips"
os.makedirs(output_folder, exist_ok=True)

# Build the full header row. column_types leaves out "datetime" (column 1),
# so prepend it; otherwise the header would be one name short of the data.
header_columns = list(column_types.keys())
if "datetime" not in header_columns:
    header_columns.insert(0, "datetime")
header_row = ','.join(header_columns) + '\n'
known_columns = set(header_columns)

# Maps each CSV member name to whether it already had a header row
headers = dict()

for this_zf in zip_files:
    # Read from the same folder that was listed above
    with ZipFile(os.path.join(zip_folder, this_zf), 'r') as zf:
        for file_name in zf.namelist():
            if file_name.endswith('/'):
                continue  # directory entry, nothing to read

            # The with-block closes the member even when it already has a header
            with zf.open(file_name, 'r') as raw_file:
                input_file = io.TextIOWrapper(raw_file, encoding="utf-8")

                # Read the first line to check for the header row
                first_line = input_file.readline()

                # A header is a first line in which at least one whole field is a
                # known column name. Comparing whole fields (not substrings) keeps
                # a data row containing e.g. "total" inside a description from
                # being mistaken for a header.
                first_fields = {
                    field.strip().strip('"\'')
                    for field in first_line.replace(';', ',').split(',')
                }
                is_header = not known_columns.isdisjoint(first_fields)
                headers[file_name] = is_header

                if not is_header:
                    # Re-use the line already read instead of seeking back
                    file_contents = first_line + input_file.read()

            if not is_header:
                # Write header + data straight into <csv name without extension>.zip;
                # no temporary folders to zip up and delete afterwards.
                member_name = os.path.basename(file_name)
                zip_file_path = os.path.join(output_folder, os.path.splitext(member_name)[0] + '.zip')
                with ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
                    new_zip.writestr(member_name, header_row + file_contents)

            print(f"File: {file_name}, Has Header: {is_header}")

In [2]:
# JSON key path. The defaults are the original local location; set the
# WEDGE_KEY_DIR / WEDGE_KEY_FILE environment variables to run the notebook
# on another machine without editing this cell.
service_path = os.environ.get("WEDGE_KEY_DIR", "C:/Users/vanes/OneDrive/Desktop/Work/MSBA/ADA/wedge_project/")
service_file = os.environ.get("WEDGE_KEY_FILE", 'wedge-project-vw-key.json')

# Gotta be credentialed
# os.path.join copes with a directory given with or without a trailing slash
credentials = service_account.Credentials.from_service_account_file(os.path.join(service_path, service_file))

# The pieces needed for the GBQ upload so it goes to the right place
project_id = 'wedge-project-vw'
dataset_id = 'wedge_data2'

# And finally we establish our connection
client = bigquery.Client(credentials=credentials, project=project_id)

In [6]:
# Full table schema: one {"name": ..., "type": ...} dictionary per column,
# generated from (column name, BigQuery type) pairs listed in column order.
schema = [
    {"name": column_name, "type": bq_type}
    for column_name, bq_type in (
        ("datetime", "TIMESTAMP"),      # 1
        ("register_no", "FLOAT"),       # 2
        ("emp_no", "FLOAT"),            # 3
        ("trans_no", "FLOAT"),          # 4
        ("upc", "STRING"),              # 5
        ("description", "STRING"),      # 6
        ("trans_type", "STRING"),       # 7
        ("trans_subtype", "STRING"),    # 8
        ("trans_status", "STRING"),     # 9
        ("department", "FLOAT"),        # 10
        ("quantity", "FLOAT"),          # 11
        ("Scale", "FLOAT"),             # 12
        ("cost", "FLOAT"),              # 13
        ("unitPrice", "FLOAT"),         # 14
        ("total", "FLOAT"),             # 15
        ("regPrice", "FLOAT"),          # 16
        ("altPrice", "FLOAT"),          # 17
        ("tax", "FLOAT"),               # 18
        ("taxexempt", "FLOAT"),         # 19
        ("foodstamp", "FLOAT"),         # 20
        ("wicable", "FLOAT"),           # 21
        ("discount", "FLOAT"),          # 22
        ("memDiscount", "FLOAT"),       # 23
        ("discountable", "FLOAT"),      # 24
        ("discounttype", "FLOAT"),      # 25
        ("voided", "FLOAT"),            # 26
        ("percentDiscount", "FLOAT"),   # 27
        ("ItemQtty", "FLOAT"),          # 28
        ("volDiscType", "FLOAT"),       # 29
        ("volume", "FLOAT"),            # 30
        ("VolSpecial", "FLOAT"),        # 31
        ("mixMatch", "FLOAT"),          # 32
        ("matched", "FLOAT"),           # 33
        ("memType", "BOOLEAN"),         # 34
        ("staff", "BOOLEAN"),           # 35
        ("numflag", "FLOAT"),           # 36
        ("itemstatus", "FLOAT"),        # 37
        ("tenderstatus", "FLOAT"),      # 38
        ("charflag", "STRING"),         # 39
        ("varflag", "FLOAT"),           # 40
        ("batchHeaderID", "BOOLEAN"),   # 41
        ("local", "FLOAT"),             # 42
        ("organic", "FLOAT"),           # 43
        ("display", "BOOLEAN"),         # 44
        ("receipt", "FLOAT"),           # 45
        ("card_no", "FLOAT"),           # 46
        ("store", "FLOAT"),             # 47
        ("branch", "FLOAT"),            # 48
        ("match_id", "FLOAT"),          # 49
        ("trans_id", "FLOAT"),          # 50
    )
]

# Columns pandas should parse as dates when reading the CSVs
date_columns = ["datetime"]

In [None]:
# Where all of the cleaned wedge zip files are kept
folder_path = 'wedge_zips'

# Create a folder to store the extracted CSV files
output_folder = 'wedge_extracted'
os.makedirs(output_folder, exist_ok=True)

# Iterate through the files in the folder
for filename in os.listdir(folder_path):
    if not filename.endswith('.zip'):
        continue

    # Construct the full path to the zip file
    zip_file_path = os.path.join(folder_path, filename)

    # Extract every member of the archive and remember what it contained
    with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
        zip_file.extractall(output_folder)
        extracted_files = zip_file.namelist()

    # When the zip holds exactly one CSV, rename it after the zip so the
    # later table name matches the archive name. Zips with several members
    # keep their original member names.
    if len(extracted_files) == 1 and extracted_files[0].endswith('.csv'):
        csv_file_path = os.path.join(output_folder, extracted_files[0])
        new_csv_name = os.path.splitext(filename)[0] + '.csv'
        # os.replace overwrites an existing target on every platform, so the
        # cell can be re-run (os.rename raises on Windows if the target exists).
        os.replace(csv_file_path, os.path.join(output_folder, new_csv_name))

print("CSV files extracted and saved in the 'wedge_extracted' folder.")


In [None]:
# Where all the extracted csv files live
folder_path = 'wedge_extracted'

# List all CSV files in the folder
csv_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.csv')]

# The values we want to be recognized as None when pandas reads them in
# (np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0)
custom_na_values = [r"\\N", r"\N", "nan", None, np.nan]

# The organic column has an empty string but its a float column
# So these are values we want replaced with None
replacement_dict = {' ': None, '': None, '  ': None}

# Target dtype per column; defined once, outside the loop, because it is the
# same for every file. "datetime" is handled by parse_dates instead.
# NOTE(review): astype(bool) maps missing values to True — confirm the bool
# columns (memType, staff, batchHeaderID, display) never contain NaN.
column_types = {
    #"datetime": "TIMESTAMP", #1
    "register_no": float, #2
    "emp_no": float, #3
    "trans_no": float, #4
    "upc": str, #5
    "description": str, #6
    "trans_type": str, #7
    "trans_subtype": str, #8
    "trans_status": str, #9
    "department": float, #10
    "quantity": float, #11
    "Scale": float, #12
    "cost": float, #13
    "unitPrice": float, #14
    "total": float, #15
    "regPrice": float, #16
    "altPrice": float, #17
    "tax": float, #18
    "taxexempt": float, #19
    "foodstamp": float, #20
    "wicable": float, #21
    "discount": float, #22
    "memDiscount": float, #23
    "discountable": float, #24
    "discounttype": float, #25
    "voided": float, #26
    "percentDiscount": float, #27
    "ItemQtty": float, #28
    "volDiscType": float, #29
    "volume": float, #30
    "VolSpecial": float, #31
    "mixMatch": float, #32
    "matched": float, #33
    "memType": bool, #34
    "staff": bool, #35
    "numflag": float, #36
    "itemstatus": float, #37
    "tenderstatus": float, #38
    "charflag": str, #39
    "varflag": float, #40
    "batchHeaderID": bool, #41
    "local": float, #42
    "organic": float, #43
    "display": bool, #44
    "receipt": float, #45
    "card_no": float, #46
    "store": float, #47
    "branch": float, #48
    "match_id": float, #49
    "trans_id": float, #50
}

for file in csv_files:
    # Use the CSV filename as the table name
    table_id = os.path.basename(file).split('.')[0]

    # Construct the BigQuery table reference
    table_ref = f"{project_id}.{dataset_id}.{table_id}"

    # Read the CSV, parsing the date column and applying the custom NA markers
    df = pd.read_csv(file, parse_dates=date_columns, na_values=custom_na_values, keep_default_na=True)

    # Blank 'organic' values become missing so the float cast succeeds.
    # Assign the result back: an inplace replace on df['organic'] may act on
    # a temporary and leave df unchanged under pandas copy-on-write.
    df['organic'] = df['organic'].replace(replacement_dict)

    # Cast every column to its target dtype
    df = df.astype(column_types)

    # astype(str) turns missing values into the literal text "nan"; map it
    # back to None. The dict form is used because replace("nan", None)
    # forward-fills on pandas < 1.4 instead of inserting None.
    df = df.replace({"nan": None})

    # Upload with the explicit schema, replacing any existing table
    pandas_gbq.to_gbq(df, destination_table=table_ref, project_id=project_id, if_exists="replace", table_schema=schema)

    print(f"File {file} uploaded to BigQuery table {table_id} in dataset {dataset_id} with the specified schema.")

In [4]:
# Forgot to upload the department lookup table
df = pd.read_csv("dept_lookup.csv")

# Get it into the same project/dataset as the transaction tables on GBQ.
# No table_schema is passed here, so pandas_gbq derives the column types itself.
dept_table_ref = f"{project_id}.{dataset_id}.dept_lookup"
pandas_gbq.to_gbq(df, destination_table=dept_table_ref, project_id=project_id, if_exists="replace")

# The old message left the table name blank and claimed a schema had been specified
print(f"File dept_lookup.csv uploaded to BigQuery table {dept_table_ref}.")