In [None]:
import pandas as pd
import time
import os.path

In [None]:
# Locations of the raw census extract (input) and the reshaped file (output)
originalPath = './LDA/98-401-X2016044_English_CSV_data.csv'
filePath = './LDA/LDA v1.csv'
# One LDA spans 2247 statistics rows, so reading 20 LDAs at a time means
# a chunk of 20 x 2247 rows
chunksize = 20 * 2247

In [None]:
# Helper function to save the data from RAM to Disk
def appendToFile(filepath, df):
    """Append ``df`` to the CSV file at ``filepath``.

    The column header is written only when the target file is missing or
    empty, so repeated calls build a single CSV with one header row.

    Args:
        filepath: Path of the CSV file to create or append to.
        df: DataFrame whose rows are appended; the index is not written.
    """
    # Use the `filepath` argument (not the module-level `filePath`) so the
    # helper writes to the location the caller actually asked for.
    needsHeader = (not os.path.isfile(filepath)
                   or os.path.getsize(filepath) == 0)

    df.to_csv(filepath, mode='a', header=needsHeader, index=False)

In [None]:
# Number of statistics rows that make up one LDA in the census file
rowsPerLda = 2247

# Load only the first LDA of the original census data to learn the labels
df = pd.read_csv(originalPath, nrows=rowsPerLda)

# The labels of the first LDA (column 9) become the columns of the new data frame
labels = tuple(df.iloc[0:rowsPerLda, 9].tolist())

# Delete the old census data since we got the labels out
del df

# Output columns: one per statistic, with the LDA ID (DAUID) at the end
outputColumns = list(labels) + ['DAUID']

# Reads the large census file in smaller chunks
for df in pd.read_csv(originalPath, chunksize=chunksize):

    # One output row per LDA found in this chunk
    ldaRows = []

    # Walk the chunk one LDA at a time. The step follows the real chunk
    # length, so a final chunk holding fewer than 20 LDAs is handled too.
    for firstIndex in range(0, len(df), rowsPerLda):

        # Rows belonging to the current LDA
        ldaBlock = df.iloc[firstIndex:firstIndex + rowsPerLda]

        # A short block cannot be lined up with the labels; fail loudly
        # instead of writing a misaligned row.
        if len(ldaBlock) != rowsPerLda:
            raise ValueError(
                'Incomplete LDA block: expected %d rows, got %d'
                % (rowsPerLda, len(ldaBlock)))

        # 12 equals the total values, 13 = Males, 14 = Females
        totalValues = ldaBlock.iloc[:, 12].tolist()

        # We attach the DAUID (LDA ID) at the end so it lines up with the cols
        totalValues.append(ldaBlock['GEO_CODE (POR)'].iloc[0])

        ldaRows.append(totalValues)

    # Build the frame once instead of growing it row by row
    newDf = pd.DataFrame(ldaRows, columns=outputColumns)

    # `filePath` is the output location defined with the other settings
    appendToFile(filePath, newDf)

    # delete the newDf to free space
    del newDf