# ACADEMY and TXT

In [10]:
import boto3
import pandas as pd
import os
import datetime as dt
from io import BytesIO, StringIO
import os
from pathlib import Path
import json


def get_s3_objects(bucket_name, folder_path):
    s3_client = boto3.client('s3')
    objects = s3_client.list_objects(Bucket=bucket_name, Prefix=folder_path)
    keys = [obj['Key'] for obj in objects['Contents'] if obj['Key']]
    dfs = []
    for key in keys:
        obj = s3_client.get_object(Bucket=bucket_name, Key=key)
        df = pd.read_csv(obj['Body'], delimiter=',')
        df['filename'] = key.split('/')[-1]
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

s3 = boto3.client('s3')
file_type = '.txt'
prefix='Talent/'
bucket_name = 'data-eng-204-final-project'

# Use paginator to retrieve all objects in the bucket with the given prefix
paginator = s3.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)
# Download each .txt file from S3
for page in page_iterator:
    for obj in page['Contents']:
        key = obj['Key']
        if key.endswith(file_type):
            local_file = os.path.join(f'data', os.path.basename(key))
            s3.download_file(bucket_name, key, local_file)
            print(f"Downloaded S3 key {key} to local file {local_file}")


def get_course_dataframe(bucket_name, course_name):
    folder_path = f'Academy/{course_name}'
    df = get_s3_objects(bucket_name, folder_path)
    df['course'] = df['filename'].str.extract(r'^(.*?)_')
    df['started_date'] = df['filename'].str.extract(r'_(\d{4}-\d{2}-\d{2})\.csv$')
    cols = df.columns.tolist()
    cols.remove('trainer')
    cols.insert(1, 'trainer')
    cols.remove('course')
    cols.insert(2, 'course')
    cols.remove('started_date')
    cols.insert(3, 'started_date')
    df = df.reindex(columns=cols)
    return df


def get_all_courses_dataframe(bucket_name):
    df_business = get_course_dataframe(bucket_name, 'Business')
    df_data = get_course_dataframe(bucket_name, 'Data')
    df_eng = get_course_dataframe(bucket_name, 'Engineering')
    return pd.concat([df_business, df_data, df_eng], ignore_index=True)


def normalise_scores_df(bucket_name):
    all_courses = get_all_courses_dataframe(bucket_name)
    weeks = [f'W{i}' for i in range(1, 11)]
    traits = ['Analytic', 'Independent', 'Determined', 'Professional', 'Studious', 'Imaginative']
    id_vars = ['name', 'trainer', 'course', 'started_date']
    value_vars = [f'{trait}_{week}' for trait in traits for week in weeks]
    scores_df = pd.melt(all_courses, id_vars=id_vars, value_vars=value_vars, var_name='trait_week', value_name='score')
    scores_df[['trait', 'week']] = scores_df['trait_week'].str.split('_W', expand=True)
    scores_df.drop('trait_week', axis=1, inplace=True)
    return scores_df


# CLEANS THE TEXT DATAFRAMES 

def text_cleaner(file_name):
    file = open(file_name,'r')
    topology_list = file.readlines()
    formats = "%d %B %Y"

    my_dict = {
        'name': [],
        'date': [],
        'academy': [],
        'presentation': [],
        'pyschometrics': []
    }

    for i, c in enumerate(topology_list):
        if i == 0:
            date = str(c[:-1])
            date = date.replace("Monday ", "").replace("Tuesday ", '').replace("Wednesday ", '').replace("Thursday ", '').replace("Friday ", '')
            date_cleaned = dt.datetime.strptime(date, formats)
        elif i == 1:
            academy = str(c[:-9])
        elif i >= 3:
            pres_score = int(c[-6:-4])
            my_dict['presentation'].append(pres_score)
            psych_score = int(c[-28:-26])
            psych_p = psych_score
            my_dict['pyschometrics'].append(psych_score)
            name = str(c[0:-47])
            my_dict['name'].append(name)
            my_dict['academy'].append(academy)
            my_dict['date'].append(date_cleaned)
    return pd.DataFrame(my_dict)

normalised_df = normalise_scores_df(bucket_name='data-eng-204-final-project')

normalised_df['trainer'].unique()

normalised_df['trainer'] = normalised_df['trainer'].replace('Ely Kely', 'Elly Kelly')

normalised_df['name'] = normalised_df['name'].str.upper()

normalised_df['started_date'] = pd.to_datetime(normalised_df['started_date'])
normalised_df['week'] = pd.to_numeric(normalised_df['week'])

temp = []
# ITERATE THROUGH THE FILES AND PERFORM THE CLEANING (TXT)
pathlist = Path('data').rglob('*')
for path in pathlist:
     path_in_str = str(path)
     temp.append(text_cleaner(path_in_str))
    #  print(path_in_str)

df_txt = pd.concat(temp)

normalised_df.to_csv('academy_clean.csv', index=False)

df_txt.to_csv('txt_clean.csv', index=False)

Downloaded S3 key Talent/Sparta Day 1 August 2019.txt to local file data\Sparta Day 1 August 2019.txt
Downloaded S3 key Talent/Sparta Day 1 May 2019.txt to local file data\Sparta Day 1 May 2019.txt
Downloaded S3 key Talent/Sparta Day 1 October 2019.txt to local file data\Sparta Day 1 October 2019.txt
Downloaded S3 key Talent/Sparta Day 10 April 2019.txt to local file data\Sparta Day 10 April 2019.txt
Downloaded S3 key Talent/Sparta Day 10 December 2019.txt to local file data\Sparta Day 10 December 2019.txt
Downloaded S3 key Talent/Sparta Day 10 January 2019.txt to local file data\Sparta Day 10 January 2019.txt
Downloaded S3 key Talent/Sparta Day 10 July 2019.txt to local file data\Sparta Day 10 July 2019.txt
Downloaded S3 key Talent/Sparta Day 10 October 2019.txt to local file data\Sparta Day 10 October 2019.txt
Downloaded S3 key Talent/Sparta Day 10 September 2019.txt to local file data\Sparta Day 10 September 2019.txt
Downloaded S3 key Talent/Sparta Day 11 April 2019.txt to local fil

# JSON

In [11]:
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')
session = boto3.session.Session()

# Boto3 clients
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')
session = boto3.session.Session()
#bucket_list = s3_client.list_buckets()


bucket_name = 'data-eng-204-final-project'
key = 'Talent/10383.json'

obj_1 = s3_client.get_object(Bucket=bucket_name, Key=key)

strbody = obj_1['Body'].read()
body = json.loads(strbody)


df = pd.DataFrame()
list_of_dict = []

paginator = s3_client.get_paginator('list_objects_v2')
results = paginator.paginate(Bucket=bucket_name)
for page in results:
    if "Contents" in page:
        for key in page[ "Contents" ]:
            keyString = key[ "Key" ]
            if '.json' in keyString:
                obj_1 = s3_client.get_object(Bucket=bucket_name, Key=keyString)
                strbody = obj_1['Body'].read()
                body = json.loads(strbody)
                list_of_dict.append(body)


df = pd.DataFrame(list_of_dict)
df.to_csv('export_dataframe.csv')

print(df)
bucket_list = s3_client.list_buckets()
bucket_name = 'data-eng-204-final-project'
key = 'Talent/10383.json'

obj_1 = s3_client.get_object(Bucket=bucket_name, Key=key)

df = pd.read_csv("export_dataframe.csv", index_col=0)
df.info()

df1 = df[df.isna().any(axis=1)]

df['tech_self_score'].fillna('N/A',inplace=True)

df[df['tech_self_score']=='N/A']

df["date"] = pd.to_datetime(df['date'], dayfirst=True)
df['name'] = df['name'].str.upper()

df.to_csv(r'clean_json.csv', index=False)

                   name        date  \
0     Stillmann Castano  22/08/2019   
1       Hilary Willmore  01/08/2019   
2         Efrem Whipple  22/08/2019   
3           Sydel Fenne  28/08/2019   
4       Michel Lebarree  07/08/2019   
...                 ...         ...   
3100    Jacky Reilingen  04/04/2019   
3101    Phillis Lyfield  10/04/2019   
3102       Celle Barlas  16/04/2019   
3103         Scott Duny  11/04/2019   
3104  Boycey Matushenko  25/04/2019   

                                        tech_self_score  \
0         {'C#': 6, 'Java': 5, 'R': 2, 'JavaScript': 2}   
1           {'Python': 1, 'C#': 4, 'Java': 2, 'C++': 4}   
2                                 {'Ruby': 4, 'C++': 4}   
3                                {'Java': 3, 'SPSS': 4}   
4     {'Python': 3, 'Java': 4, 'Ruby': 1, 'R': 2, 'P...   
...                                                 ...   
3100            {'C#': 2, 'Java': 6, 'R': 1, 'SPSS': 4}   
3101          {'C#': 4, 'Java': 4, 'Ruby': 4, 'PHP': 1}   


# TALENT INFO

In [12]:
import pandas as pd
import boto3
import os

# Set up S3 client
s3 = boto3.resource('s3')

# Set up S3 bucket and folder
bucket_name = 'data-eng-204-final-project'
folder_name = 'Talent/'

# Get list of CSV files in folder
bucket = s3.Bucket(bucket_name)
csv_files = [obj.key for obj in bucket.objects.filter(Prefix=folder_name) if obj.key.endswith('.csv')]

# Merge all CSV files into a single DataFrame
df_list = []
for csv_file in csv_files:
    obj = s3.Object(bucket_name, csv_file)
    body = obj.get()['Body']
    df = pd.read_csv(body)
    df_list.append(df)
merged_df = pd.concat(df_list)

print(f"Successfully merged and saved {len(df_list)} CSV files into 'merged_data.csv'.")

merged_df['name'] = merged_df['name'].str.upper()

merged_df.fillna(value='unknown', inplace=True)

merged_df.drop(columns=['address','uni','degree','invited_date','month','invited_by'],inplace=True)

merged_df.drop(columns=['id'],inplace=True)

merged_df.to_csv('talent_info.csv', index=False)

Successfully merged and saved 12 CSV files into 'merged_data.csv'.


In [14]:
df = pd.read_csv('talent_info.csv')
df1 = pd.read_csv('txt_clean.csv')
df2 = pd.read_csv('clean_json.csv')

df_talent = pd.merge(pd.merge(df,df1, on='name', how='outer'), df2, on='name', how='outer')

df_talent.drop('date_y', axis=1, inplace=True)

df_talent.drop_duplicates(inplace=True)

df_talent['phone_number'] = df_talent['phone_number'].str.replace(' ', '-').str.replace('(', '').str.replace(')', '')

df_talent = df_talent.rename(columns={'pyschometrics': 'psychometrics', 'date_x': 'date'})

df_talent['dob'] = df_talent['dob'].replace('unknown', None)

df_talent['dob'] = pd.to_datetime(df_talent['dob'], format='%d/%m/%Y')
df_talent['date'] = pd.to_datetime(df_talent['date'])

df_talent.to_csv('talent_combined.csv', index=False)

  df_talent['phone_number'] = df_talent['phone_number'].str.replace(' ', '-').str.replace('(', '').str.replace(')', '')


In [15]:
s3 = boto3.client('s3')

bucket_name = 'data-eng-204-final-project'
prefix = 'cleaned_data/'

# List of local files to upload
local_files = ['clean_json.csv', 'academy_clean.csv', 'talent_combined.csv', 'talent_info.csv', 'txt_clean.csv']

for file in local_files:
    # Generate S3 key based on the file name
    s3_key = prefix + file.split('/')[-1]

    # Upload the file to S3
    s3.upload_file(file, bucket_name, s3_key)

    print(f"Uploaded {file} to S3 bucket {bucket_name} with key {s3_key}")

Uploaded clean_json.csv to S3 bucket data-eng-204-final-project with key cleaned_data/clean_json.csv
Uploaded academy_clean.csv to S3 bucket data-eng-204-final-project with key cleaned_data/academy_clean.csv
Uploaded talent_combined.csv to S3 bucket data-eng-204-final-project with key cleaned_data/talent_combined.csv
Uploaded talent_info.csv to S3 bucket data-eng-204-final-project with key cleaned_data/talent_info.csv
Uploaded txt_clean.csv to S3 bucket data-eng-204-final-project with key cleaned_data/txt_clean.csv
