# In [5]:
import pandas as pd # Import pandas for data manipulation and DataFrame operations
import glob # Import glob for finding files matching a pattern (e.g., all CSVs in a directory)
import os # Import os for operating system-related functionalities (e.g., path manipulation, directory creation)

# In [6]:
# --- Configuration ---
# Folder that holds the raw per-subject CSV files to be combined.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): presumably this is where the THE scraper writes its output - confirm.
CSV_INPUT_FOLDER = 'THE_rankings_raw_data'

# Folder that receives the combined per-year CSVs (also relative to the working directory).
COMBINED_OUTPUT_FOLDER = 'Combined_THE_Rankings'

# Create the output folder. Attempting the creation and reacting to the outcome avoids
# the check-then-create race, and fails fast if the path is taken by a regular file
# instead of silently skipping creation and failing later when the CSVs are written.
try:
    os.makedirs(COMBINED_OUTPUT_FOLDER)
except FileExistsError:
    if not os.path.isdir(COMBINED_OUTPUT_FOLDER):
        # Something that is not a directory already occupies the path.
        raise
else:
    # Only announce the folder when this run actually created it.
    print(f"Created output directory for combined THE CSVs: {COMBINED_OUTPUT_FOLDER}")

# --- File Discovery ---
# Collect every '*.csv' directly inside the input folder (the pattern is not recursive).
# glob returns matches in arbitrary order, so the list is sorted to make the row order
# of the combined output reproducible from run to run.
all_files = sorted(glob.glob(os.path.join(CSV_INPUT_FOLDER, "*.csv")))
print(f"Found CSV files: {all_files}")

# Maps a year (string parsed from the filename) to the list of DataFrames loaded for
# that year, so all subject files of one year can be concatenated together later.
dfs_by_year = {}

# --- Process Each CSV File ---
# For every discovered CSV: parse subject and year from the filename, load the file,
# tag its rows with 'Year' and 'Subject', and group the DataFrame under its year.
# Expected filename shape: '<subject>_rankings_<year>.csv',
# e.g. 'arts-and-humanities_rankings_2025.csv'.
for filepath in all_files:
    # Work on the bare filename so directory names never influence the parsing.
    filename = os.path.basename(filepath)
    print(f"\nProcessing file: {filename}")

    # 'arts-and-humanities_rankings_2025.csv' -> ['arts-and-humanities', 'rankings', '2025.csv']
    parts = filename.split('_')
    print(f"Filename parts: {parts}")

    # Everything before the last two parts is the subject; re-joining with '_' keeps
    # subjects that themselves contain underscores (e.g. 'life_sciences') intact.
    subject = '_'.join(parts[:-2])
    # The last part is '<year>.csv'; keep only the text before the first dot.
    year = parts[-1].split('.')[0]

    # Splitting and slicing never raise, so a misnamed file must be rejected explicitly.
    # Otherwise e.g. 'notes.csv' would be accepted with an empty subject and year 'notes'.
    if len(parts) < 3 or not subject or not year.isdigit():
        print(
            f"Skipping file with unexpected name or format: {filepath}. "
            f"Error: expected '<subject>_rankings_<year>.csv'"
        )
        continue

    try:
        # Only the read is guarded: pandas signals empty or malformed CSV content with
        # ValueError subclasses (EmptyDataError, ParserError).
        df = pd.read_csv(filepath)
    except ValueError as e:
        print(f"Skipping file with unexpected name or format: {filepath}. Error: {e}")
        continue
    print(f"DataFrame loaded. Original columns: {df.columns.tolist()}")

    # --- Add New Features ---
    # Every row gets the year and subject taken from the filename (both stay strings).
    df['Year'] = year
    df['Subject'] = subject

    # --- Grouping DataFrames by Year ---
    # setdefault creates the per-year list on first use and appends to it afterwards.
    dfs_by_year.setdefault(year, []).append(df)

# --- Combine and Save DataFrames by Year ---
# Concatenate all DataFrames collected for each year and write one CSV per year
# into COMBINED_OUTPUT_FOLDER.
print("\nCombining and saving data by year...")
for year, dfs in dfs_by_year.items():
    # ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
    # repeating each source file's own index values.
    combined_df = pd.concat(dfs, ignore_index=True)

    # One output file per year, e.g. 'THE_Rankings_Combined_2025.csv'.
    output_filename = f'THE_Rankings_Combined_{year}.csv'
    output_filepath = os.path.join(COMBINED_OUTPUT_FOLDER, output_filename)
    # index=False keeps the DataFrame index out of the written CSV.
    combined_df.to_csv(output_filepath, index=False)
    print(f"Saved combined data for {year} to: {output_filepath}")

if dfs_by_year:
    # Report the configured folder rather than a hard-coded copy of its name, so the
    # message stays correct if COMBINED_OUTPUT_FOLDER is changed.
    print(f"\nData wrangling complete. Combined CSVs are in the '{COMBINED_OUTPUT_FOLDER}' folder.")
else:
    # Nothing was grouped, so claiming that combined CSVs exist would be misleading.
    print("\nNo input CSVs were loaded, so no combined files were written.")

['C:/Users/d49075vs/OneDrive - The University of Manchester/Documents/Python Projects/World Rankings/THE Rankings\\Arts_Humanities_rankings_2020.csv', 'C:/Users/d49075vs/OneDrive - The University of Manchester/Documents/Python Projects/World Rankings/THE Rankings\\Arts_Humanities_rankings_2021.csv', 'C:/Users/d49075vs/OneDrive - The University of Manchester/Documents/Python Projects/World Rankings/THE Rankings\\Arts_Humanities_rankings_2022.csv', 'C:/Users/d49075vs/OneDrive - The University of Manchester/Documents/Python Projects/World Rankings/THE Rankings\\Arts_Humanities_rankings_2023.csv', 'C:/Users/d49075vs/OneDrive - The University of Manchester/Documents/Python Projects/World Rankings/THE Rankings\\Arts_Humanities_rankings_2024.csv', 'C:/Users/d49075vs/OneDrive - The University of Manchester/Documents/Python Projects/World Rankings/THE Rankings\\Arts_Humanities_rankings_2025.csv', 'C:/Users/d49075vs/OneDrive - The University of Manchester/Documents/Python Projects/World Rankings