In [1]:


import logging
import os
from datetime import datetime
from io import StringIO

import boto3
import gdown
import pandas as pd
import psycopg2

# Create a timestamp-based filename for the log file.
# The directory defaults to the original location; set INGESTION_LOG_DIR to
# run the notebook on a machine with a different layout.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_dir = os.environ.get("INGESTION_LOG_DIR", "/home/ubuntu/DMML_project/Logs/Ingestion_To_Raw")
# basicConfig opens the log file immediately, so the directory must exist first.
os.makedirs(log_dir, exist_ok=True)
log_filename = os.path.join(log_dir, f"ingestion_{timestamp}.log")

# Set up logging.
# basicConfig does nothing when the root logger already has handlers, which is
# the case when this cell is re-run in the same kernel. Drop (and close) the
# old handlers so every run really writes to its own timestamped file.
root_logger = logging.getLogger()
for stale_handler in list(root_logger.handlers):
    root_logger.removeHandler(stale_handler)
    stale_handler.close()
logging.basicConfig(filename=log_filename, level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s')

# Google Drive file ID
file_id = '1XHN3wvuqvq-zrH_jr3YWC-RR4flXKIW7'

# Local path the Google Drive export is written to and then read back from.
gdrive_csv_path = 'customer_churn.csv'

# Bound up front so the name always exists; it stays None when the download
# or the parse fails.
df_gdrive = None

# Download CSV file from Google Drive
url = f"https://drive.google.com/uc?id={file_id}"
download_ok = False
try:
    downloaded_path = gdown.download(url, gdrive_csv_path, quiet=False)
    # NOTE(review): some gdown releases appear to signal a failed download by
    # returning None instead of raising - confirm against the installed version.
    if downloaded_path is None:
        logging.error("Error downloading file from Google Drive: gdown returned no output path")
    else:
        download_ok = True
        logging.info("File from Google Drive saved to customer_churn.csv")
except Exception as e:
    logging.error(f"Error downloading file from Google Drive: {e}")

# Read CSV file.
# Only read after a successful download; otherwise a stale customer_churn.csv
# left over from an earlier run would be ingested as if it were fresh.
if download_ok:
    try:
        df_gdrive = pd.read_csv(gdrive_csv_path)
        logging.info("Data from Google Drive loaded into DataFrame")
        # NOTE(review): this writes raw customer rows into the log file; confirm
        # that is acceptable for this dataset.
        logging.info("First 5 rows of Google Drive data:")
        logging.info(df_gdrive.head(5).to_string())
    except Exception as e:
        logging.error(f"Error loading data from Google Drive: {e}")

# Postgres RDS connection details
host = 'database-dmml.cluster-czyuk8c4op6k.eu-north-1.rds.amazonaws.com'
port = 5432
database = 'postgres'
username = 'postgres'
# The password is read from the environment instead of being stored in the
# notebook. The value that used to be hardcoded here should be treated as
# compromised and rotated.
# NOTE(review): when PGPASSWORD is unset this is None; psycopg2 is expected to
# drop None-valued keyword arguments and let libpq use its own lookup - confirm.
password = os.environ.get('PGPASSWORD')
schema = 'public'

# Bound up front so both names exist even when the connection or query fails.
conn = None
df_postgres = None

# Connect to Postgres RDS
try:
    conn = psycopg2.connect(
        host=host,
        port=port,
        database=database,
        user=username,
        password=password
    )
    logging.info("Connected to Postgres RDS")
except psycopg2.Error as e:
    logging.error(f"Error connecting to Postgres RDS: {e}")

# Fetch data from Postgres RDS (skipped when no connection was established)
if conn is not None:
    try:
        # The cursor context manager closes the cursor when the block exits.
        with conn.cursor() as cur:
            query = f"SELECT * FROM {schema}.customer_churn_db"
            cur.execute(query)
            rows = cur.fetchall()
            # cursor.description holds one entry per result column; item 0 is its name.
            columns = [desc[0] for desc in cur.description]
        df_postgres = pd.DataFrame(rows, columns=columns)
        logging.info("Data from Postgres RDS loaded into DataFrame")
        # NOTE(review): this writes raw customer rows into the log file; confirm
        # that is acceptable for this dataset.
        logging.info("First 5 rows of Postgres RDS data:")
        logging.info(df_postgres.head(5).to_string())
    except psycopg2.Error as e:
        logging.error(f"Error fetching data from Postgres RDS: {e}")
    finally:
        # Close Postgres RDS connection, whether or not the fetch succeeded.
        conn.close()
        logging.info("Postgres RDS connection closed")

# S3 credentials.
# Keys are read from the standard AWS environment variables instead of being
# stored in the notebook. The key pair that used to be hardcoded here should be
# treated as compromised and rotated. When the variables are unset both values
# are None and boto3 resolves credentials through its default chain.
ACCESS_KEY = os.environ.get('AWS_ACCESS_KEY_ID')
SECRET_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')
BUCKET_NAME = 'dmml-storage-bits'
REGION = 'eu-north-1'

# Create S3 client
s3 = boto3.client('s3', aws_access_key_id=ACCESS_KEY,
                      aws_secret_access_key=SECRET_KEY,
                      region_name=REGION)

# Computed once so both objects land in the same date partition, even when the
# run crosses midnight.
run_date = pd.Timestamp.now().strftime('%Y-%m-%d')


def _upload_frame_to_s3(frame, source, label):
    """Serialise ``frame`` as CSV and store it under the raw-data prefix.

    Args:
        frame: DataFrame to upload; None means the source was not loaded.
        source: value used for the ``source=`` segment of the object key.
        label: human-readable source name used in log messages.

    Returns:
        The S3 object key the data was written to.

    Raises:
        ValueError: if ``frame`` is None.
    """
    if frame is None:
        raise ValueError(f"no {label} data available to upload")
    key = f"raw-data/source={source}/type=customer_churn/timestamp={run_date}/customer_churn.csv"
    csv_buffer = StringIO()
    frame.to_csv(csv_buffer, index=False)
    s3.put_object(Body=csv_buffer.getvalue(), Bucket=BUCKET_NAME, Key=key)
    logging.info(f"{label} data uploaded to S3: {key}")
    return key


# Upload DataFrames to S3.
# Each source gets its own try block so a failure in one does not prevent the
# other from being uploaded. The DataFrame name is evaluated inside the try so
# that an unbound name is logged like any other upload error.

# Google Drive data
try:
    s3_file = _upload_frame_to_s3(df_gdrive, 'google_drive', 'Google Drive')
except Exception as e:
    logging.error(f"Error uploading data to S3: {e}")

# Postgres RDS data
try:
    s3_file = _upload_frame_to_s3(df_postgres, 'postgres_rds', 'Postgres RDS')
except Exception as e:
    logging.error(f"Error uploading data to S3: {e}")



Downloading...
From: https://drive.google.com/uc?id=1XHN3wvuqvq-zrH_jr3YWC-RR4flXKIW7
To: /home/ubuntu/DMML_project/customer_churn.csv
100%|██████████| 1.40M/1.40M [00:00<00:00, 20.3MB/s]
