# Data Transfer between Pandas Dataframes, local files and AWS s3
Author: Yuan Huang

This section includes routine functions for transferring data from pandas DataFrames or local files to AWS S3, and for reading data from S3 back into a pandas DataFrame. DataFrame contents are gzip-compressed before upload (`df_to_s3()`) and decompressed when they are read back from S3 into a DataFrame (`s3_to_df()`). Local files uploaded with `file_to_s3()` are transferred to S3 directly, without compression.

In [257]:
import pandas as pd
import os
from io import BytesIO
from io import StringIO
import boto3
import gzip


def df_to_s3(session, df, bucket_name, s3_path, file_key):
    """
    Upload a pandas dataframe to AWS s3 as a gzip-compressed CSV.

    The dataframe is serialized to CSV (without the index column), encoded
    as UTF-8, gzip-compressed in memory and stored under the object key
    ``s3_path + file_key``.

    Inputs:
      session:     An AWS session object (must provide ``resource('s3')``)
      df:          A pandas dataframe containing the data for transfer
      bucket_name: s3 bucket name
      s3_path:     s3 path; it is joined to file_key by plain string
                   concatenation, so include any trailing separator
      file_key:    file name on s3
    Output:
      None. The data is saved in the designated s3 bucket, s3 path and file name
    """
    # to_csv() with no target returns the CSV text directly, so no
    # intermediate text buffer is required.
    csv_bytes = df.to_csv(index=False).encode('utf-8')

    # Compress in memory; the result is a complete gzip stream that
    # gzip.GzipFile / gzip.decompress can read back.
    gz_bytes = gzip.compress(csv_bytes)

    # Initialize the s3 resource object and store the compressed payload
    # at the designated s3 location.
    s3_resource = session.resource('s3')
    s3_resource.Object(bucket_name, s3_path + file_key).put(Body=gz_bytes)
    
def s3_to_df(session,bucket_name,key,header=None):
    """
    Read a gzip-compressed CSV object from AWS s3 into a pandas dataframe.

    Inputs:
      session:     An AWS session object
      bucket_name: s3 bucket name
      key:         file name (object key) on s3
      header:      passed through to pandas.read_csv; None (default) when
                   the s3 file has no header row, or 'infer' when it does
    Output:
      A pandas dataframe that contains data in the designated s3 location
    """
    # Locate the object and grab the body stream of its contents.
    s3_object = session.resource('s3').Object(bucket_name, key)
    body_stream = s3_object.get()['Body']

    # Decompress on the fly while pandas parses the CSV text.
    decompressed = gzip.GzipFile(fileobj=body_stream)
    return pd.read_csv(decompressed, header=header)

def file_to_s3(session,bucket,key,localfile):
    """
    Upload a local file to AWS s3 as-is (no compression is applied).

    Inputs:
      session:     An AWS session object
      bucket:      s3 bucket name
      key:         file name (object key) on s3
      localfile:   location and name of the local file for transfer
    Output:
      None. The local file is transferred to the designated s3 location
    """
    s3_resource = session.resource('s3')
    # Open in binary mode so the bytes are uploaded unchanged, and use a
    # context manager so the file handle is closed even if the upload fails.
    with open(localfile, 'rb') as file_handle:
        s3_resource.Object(bucket, key).put(Body=file_handle)
        

In [None]:
# Example: read the gzip-compressed CSV stored at s3_path + "test_gz1.csv"
# into a dataframe, using its first row as the header ("infer").
# NOTE(review): session, bucket_name and s3_path are not defined in the cells
# shown here — presumably set in an earlier cell; confirm before running.
s3_to_df(session,bucket_name,s3_path+"test_gz1.csv","infer")

In [258]:
# Example: upload the local file "four_hour_June.csv" to the s3 object key
# s3_path + "transfer_file".
# NOTE(review): session, bucket_name and s3_path are not defined in the cells
# shown here — presumably set in an earlier cell; confirm before running.
file_to_s3(session,bucket_name,s3_path+"transfer_file","four_hour_June.csv")