### Imports

In [1]:
import pandas as pd
import numpy as np
import os

### List Files to Import

In [26]:
# Load the tracking table that records each tweet-ID file and its processing flags
df_scraped_data = pd.read_csv(
    filepath_or_buffer='./Data/tweet_ids/scraped_files.csv',
)

In [27]:
# List everything in the tweet-ID directory except the bookkeeping files.
# Filtering (rather than list.remove) avoids a ValueError when an excluded
# name such as desktop.ini is not present in the directory.
ignored_names = {'scraped_files.csv', 'desktop.ini'}
data_files = [name for name in os.listdir('./Data/tweet_ids') if name not in ignored_names]

# Keep only the files that are not yet recorded in the tracking table
known_files = set(df_scraped_data['file'])
new_files = [name for name in data_files if name not in known_files]

In [28]:
# Build one row per new file with both status flags ('stripped', 'combined') set to 0.
# A dict constructor keeps the flag columns numeric instead of object-typed.
df_new_files = pd.DataFrame({
    'file': new_files,
    'stripped': [0] * len(new_files),
    'combined': [0] * len(new_files),
})

# Add the new rows to the tracking table. DataFrame.append was removed in
# pandas 2.0, so pd.concat is used; skip it entirely when nothing is new.
if new_files:
    df_scraped_data = pd.concat([df_scraped_data, df_new_files], ignore_index=True)
else:
    df_scraped_data = df_scraped_data.reset_index(drop=True)

### Prepare Tweet ID Files for Hydration

In [30]:
# Loop through all files that have not yet been stripped of sentiment scores
unstripped_files = df_scraped_data.loc[df_scraped_data['stripped'] == 0, 'file'].tolist()
for file in unstripped_files:
    # The files are read without a header row; keep only the first column
    df_file = pd.read_csv(f'./Data/tweet_ids/{file}', header=None)
    df_file = df_file[0]

    # Overwrite the original file with the single remaining column
    df_file.to_csv(f'./Data/tweet_ids/{file}', index=False, header=False)

    # Record that this file has been stripped. This must be an assignment:
    # a comparison (==) here builds a boolean Series and discards it,
    # leaving the flag at 0 so the file is re-processed on every run.
    df_scraped_data.loc[df_scraped_data['file'] == file, 'stripped'] = 1

### Combine CSVs for Hydration

In [38]:
# Build one combined file per month from every tracked file of that month
for month in ['march','april','may','june','july','august']:
    # Select the tracked files whose name begins with this month
    is_month_file = df_scraped_data['file'].str.startswith(month)
    month_files = df_scraped_data.loc[is_month_file, 'file']

    # pd.concat raises ValueError on an empty list, so skip months with no files
    if month_files.empty:
        continue

    # Combine all of the files for that month into a single dataframe
    df_combined = pd.concat([pd.read_csv(f'./Data/tweet_ids/{file}', header=None) for file in month_files])

    # Write the combined data to a separate per-month file (inputs are left untouched)
    df_combined.to_csv(f'./Data/tweet_ids/tweet_ids_{month}.csv', index=False, header=False)

    # Set combined column to 1 for all files in given month
    df_scraped_data.loc[is_month_file, 'combined'] = 1

### Output File

In [46]:
# Persist the updated tracking table back to its CSV, without the row index
df_scraped_data.to_csv(
    path_or_buf='./Data/tweet_ids/scraped_files.csv',
    index=False,
)