# Combine Files

This notebook contains cells that combine multiple CSV files, or multiple JSON files, from one folder into a single file.

**Note:** This notebook assumes that all files of the same type (CSV or JSON) in the input folder have the same column structure.

## Import Dependencies

In [None]:
import os
import pandas as pd

## Set Variables

In [None]:
# Folder that is scanned for the .csv / .json files to combine
input_folder = 'data/'
# Folder the combined file is written to; the trailing slash is kept so that
# plain string concatenation with the filename also yields a valid path
output_folder = 'data/'
# Base name of the combined file, without extension ('.csv' or '.json' is
# appended by the export steps)
output_filename = 'combined'

## Combine CSV Files

In [None]:
# Step 1: Collect the CSV files in the input folder
# The output path is resolved first so that a combined file written by an
# earlier run is not read back in as input (input and output folders may be
# the same directory). The listing is sorted because os.listdir returns
# entries in arbitrary order, which would make the row order non-reproducible.
output_file_path = os.path.join(output_folder, f'{output_filename}.csv')
output_abs_path = os.path.abspath(output_file_path)
csv_files = sorted(
    file for file in os.listdir(input_folder)
    if file.endswith('.csv')
    and os.path.abspath(os.path.join(input_folder, file)) != output_abs_path
)

# Step 2: Combine contents of each file into a single pandas dataframe
# Read everything first and concatenate once; concatenating inside the loop
# copies the accumulated frame on every iteration.
frames = [pd.read_csv(os.path.join(input_folder, file)) for file in csv_files]
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
df_combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(len(df_combined))

# Step 3: Remove Duplicates
# drop_duplicates returns a new dataframe, so the result must be assigned back;
# ignore_index=True already renumbers the rows 0..n-1.
df_combined = df_combined.drop_duplicates(ignore_index=True)

print(len(df_combined))

# Step 4: Export the pandas dataframe
# utf-8-sig writes a BOM at the start of the file
df_combined.to_csv(output_file_path, index=False, encoding='utf-8-sig')

## Combine JSON Files

In [None]:
# Step 1: Collect the JSON files in the input folder
# The output path is resolved first so that a combined file written by an
# earlier run is not read back in as input (input and output folders may be
# the same directory). The listing is sorted because os.listdir returns
# entries in arbitrary order, which would make the row order non-reproducible.
output_file_path = os.path.join(output_folder, f'{output_filename}.json')
output_abs_path = os.path.abspath(output_file_path)
json_files = sorted(
    file for file in os.listdir(input_folder)
    if file.endswith('.json')
    and os.path.abspath(os.path.join(input_folder, file)) != output_abs_path
)

# Step 2: Combine contents of each file into a single pandas dataframe
# Read everything first and concatenate once; concatenating inside the loop
# copies the accumulated frame on every iteration.
frames = [pd.read_json(os.path.join(input_folder, file)) for file in json_files]
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
df_combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(len(df_combined))

# Step 3: Remove Duplicates
# drop_duplicates returns a new dataframe, so the result must be assigned back;
# ignore_index=True already renumbers the rows 0..n-1.
df_combined = df_combined.drop_duplicates(ignore_index=True)

print(len(df_combined))

# Step 4: Export the pandas dataframe
# orient='records' with lines=False writes a single JSON array of row objects
df_combined.to_json(output_file_path, orient='records', lines=False)