# Dataframe Creator
This notebook reads raw text files and builds a pandas DataFrame from them, which is saved as a .csv file for use in other applications.

In [4]:
import pandas as pd, os, gc as gc
from tqdm import tqdm_notebook as tqdm

In [5]:
# Function to build a dataframe from a given basepath
def build_df(path, basepath, df, classification):
    """Add one row per file in a directory to a dataframe.

    Every entry listed in ``path`` is opened as UTF-8 text from ``basepath``,
    its newline characters are removed, and a row
    ``[file name, text, classification]`` is placed after the rows of ``df``.

    Args:
        path: Directory to list (str, bytes or path-like).
        basepath: Directory the listed files are opened from. A trailing
            path separator is accepted but no longer required.
        df: Dataframe with exactly three columns (ID, text, class label, in
            that order) whose rows precede the new ones.
        classification: Value stored in the last column of every new row.

    Returns:
        A dataframe containing the rows of ``df`` followed by one row per
        file (files in sorted name order), indexed 0..n-1. The input ``df``
        is not modified, so callers must use the return value. If the
        directory is empty, ``df`` itself is returned unchanged.
    """
    # Kept from the original: switches on tqdm's pandas hooks for the session.
    # NOTE(review): nothing in this function uses them; confirm no other cell
    # relies on this call before removing it.
    tqdm().pandas()

    # os.fsdecode accepts str, bytes and path-like input, so os.listdir
    # returns str names directly (no encode/decode round trip). os.listdir
    # order is arbitrary; sorting makes the row order reproducible.
    names = sorted(os.listdir(os.fsdecode(path)))

    # Collect rows in a plain list and build the frame once at the end;
    # growing a DataFrame row by row copies data on every insert.
    rows = []
    for name in tqdm(names):
        with open(os.path.join(basepath, name), 'r', encoding="utf8") as curr:
            # Newlines are dropped (not replaced by spaces), as before.
            text = curr.read().replace('\n', '')
        rows.append([name, text, classification])

    if rows:
        new_rows = pd.DataFrame(rows, columns=df.columns)
        if df.empty:
            # Nothing to prepend; skipping concat also avoids dtype inference
            # against an all-empty frame.
            df = new_rows
        else:
            # ignore_index renumbers rows 0..n-1, so a pre-existing index
            # label can never be overwritten by a new row.
            df = pd.concat([df, new_rows], ignore_index=True)

    # One collection after the loop instead of one per file.
    gc.collect()

    # Alert user to finish
    print("Done")

    return df

In [None]:
# Assemble the full dataset: negative reviews first, then positive ones
neg_path = 'D:\\Git\\Bayesian-Research\\movie_data\\neg'
pos_path = 'D:\\Git\\Bayesian-Research\\movie_data\\pos'
neg_basepath = "D:\\Git\\Bayesian-Research\\movie_data\\neg\\"
pos_basepath = "D:\\Git\\Bayesian-Research\\movie_data\\pos\\"

# Start from an empty frame that only defines the three columns
df = pd.DataFrame(columns=['ID', 'Text', 'Class'])

# One entry per review set: (directory to list, prefix to open from, label)
review_sets = (
    (neg_path, neg_basepath, -1),
    (pos_path, pos_basepath, 1),
)
for listing_dir, open_prefix, label in review_sets:
    df = build_df(listing_dir, open_prefix, df, label)

gc.collect()

# Write the combined frame to disk for the other notebooks to consume
df.to_csv('D:\\Git\\Bayesian-Research\\complete_movie_data.csv')

gc.collect()

Done
