In [8]:
import boto3
from datetime import datetime 
import pandas as pd
import os
import io
from pymongo import MongoClient
from user_agents import parse
from apscheduler.schedulers.blocking import BlockingScheduler



def loop():   
    year=datetime.now().year
    month=datetime.now().month
    day=datetime.now().day
    hour=datetime.now().hour
    path = f"year={year:04d}/month={month:02d}/day={day:02d}/hour=15/"
    prefix = path
    dataframe_collector(prefix)
    print("1")

def dataframe_collector(prefix):
    all_csv_files = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    collect_dataframes = []
    for csv_file in all_csv_files['Contents']:
        if csv_file['Key'].endswith('.csv'):
            csv_file_data = s3.get_object(Bucket=bucket, Key=csv_file['Key'])['Body'].read().decode('utf-8')
            tmp_df = pd.read_csv(io.StringIO(csv_file_data))
            collect_dataframes.append(tmp_df)
    final_dataframe = pd.concat(collect_dataframes, ignore_index=True)
    parser_function(final_dataframe)
    print("2")


def parser_function(dataframe):
    dataframe['browser'] = dataframe['user_agent'].apply(lambda x: parse(x).browser.family)
    dataframe['device'] = dataframe['user_agent'].apply(lambda x: parse(x).device.family)
    transformer(dataframe)
    print("3")


def transformer(dataframe):
    dataframe=dataframe.drop(['user_agent','Unnamed: 0'],axis=1)
    dataframe['page_views'] = 1
    dataframe = dataframe.rename(columns={'user_cookie': 'unique_user'})
    dataframe = dataframe.groupby(['site', 'device', 'browser']).agg({'page_views': 'sum', 'unique_user': 'nunique'})
    store_records(dataframe)
    print("4")


def store_records(dataframe):
    data = dataframe.to_dict('records')
    collection.insert_many(data)
    print("5")


if __name__=="__main__":
    bucket = "test-bucket"
    s3 = boto3.client(
    "s3",
    endpoint_url="http://localhost:9000",
    aws_access_key_id="usmanadmin",
    aws_secret_access_key="usmanadmin",
    )
    client = MongoClient("mongodb://localhost:27017/")
    db = client["UserActivityDatabase"]
    collection = db["userstraffic"]
    scheduler = BlockingScheduler()
    scheduler.add_job(loop, 'interval', minutes=1)
    scheduler.start()


5
4
3
2
1


KeyboardInterrupt: 

In [9]:
cursor = db.userstraffic.find({})
for document in cursor:
    print(document)

{'_id': ObjectId('63f79cb2a8a7e17e1d362ca6'), 'page_views': 9507, 'user_cookie': 7380}
{'_id': ObjectId('63f79cb2a8a7e17e1d362ca7'), 'page_views': 9632, 'user_cookie': 7321}
{'_id': ObjectId('63f79cb2a8a7e17e1d362ca8'), 'page_views': 9577, 'user_cookie': 7375}
{'_id': ObjectId('63f79cb2a8a7e17e1d362ca9'), 'page_views': 9677, 'user_cookie': 7421}
{'_id': ObjectId('63f79cb2a8a7e17e1d362caa'), 'page_views': 9546, 'user_cookie': 7332}
{'_id': ObjectId('63f79cb2a8a7e17e1d362cab'), 'page_views': 9626, 'user_cookie': 7361}
{'_id': ObjectId('63f79cb2a8a7e17e1d362cac'), 'page_views': 9508, 'user_cookie': 7285}
{'_id': ObjectId('63f79cb2a8a7e17e1d362cad'), 'page_views': 9737, 'user_cookie': 7507}
{'_id': ObjectId('63f79cb2a8a7e17e1d362cae'), 'page_views': 9772, 'user_cookie': 7477}
{'_id': ObjectId('63f79cb2a8a7e17e1d362caf'), 'page_views': 19134, 'user_cookie': 11607}
{'_id': ObjectId('63f79cb2a8a7e17e1d362cb0'), 'page_views': 9529, 'user_cookie': 7308}
{'_id': ObjectId('63f79cb2a8a7e17e1d362cb