In [1]:
# Install into the running kernel's environment (%pip, not !pip, targets the kernel).
# openpyxl is the engine pandas needs to read the .xlsx workbooks loaded below.
%pip install pandas pymongo openpyxl



In [3]:
import os
import pandas as pd
from pymongo import MongoClient

In [4]:
# Open a MongoDB connection on port 27017 (no host is passed, so the driver's
# default host is used), then pick the database and collection that the rest
# of the notebook reads from and writes to.
client = MongoClient(port=27017)
db = client["bank_project"]
collection = db["economic_indicators"]

In [5]:
# Folder that holds the Excel workbooks to import. The path is relative, so it
# is resolved against the notebook's current working directory.
excel_folder_path = "GemDataEXTR"  

In [9]:
# Function to check if file is relevant
def is_relevant_file(filename):
    """Return True when ``filename`` names a GDP or CPI Excel workbook.

    A file is relevant when its name contains ``GDP`` or ``CPI`` and ends with
    the ``.xlsx`` extension. Both checks are case-insensitive, so a file such
    as ``gdp.XLSX`` is not silently skipped. Names starting with ``~$`` are
    rejected: Excel creates those lock files next to a workbook while it is
    open, and they cannot be read as workbooks.

    Args:
        filename: Bare file name (not a full path), e.g. an ``os.listdir`` entry.

    Returns:
        bool: True if the file should be loaded, False otherwise.
    """
    # Excel lock/owner files share the real workbook's name and extension.
    if filename.startswith('~$'):
        return False

    # Normalise once so the extension and keyword checks agree on case.
    upper_name = filename.upper()
    if not upper_name.endswith('.XLSX'):
        return False
    return 'GDP' in upper_name or 'CPI' in upper_name


In [11]:
# Clear existing collection to start fresh.
# WARNING: the empty filter {} matches every document, so this removes all
# data currently stored in the collection before the files are loaded again.
collection.delete_many({})

DeleteResult({'n': 0, 'ok': 1.0}, acknowledged=True)

In [13]:
# Process files with GDP or CPI in their names.
# Each relevant workbook is read into a DataFrame, turned into one document per
# row, tagged with metadata derived from the file name, and bulk-inserted.
processed_files = []
for file_name in os.listdir(excel_folder_path):
    if is_relevant_file(file_name):
        file_path = os.path.join(excel_folder_path, file_name)
        print(f"Processing file: {file_name}")

        try:
            # Load the Excel file
            df = pd.read_excel(file_path)

            # The metadata is the same for every row of a file, so build it once.
            name_upper = file_name.upper()
            name_lower = file_name.lower()
            metadata = {
                'indicator_type': 'GDP' if 'GDP' in name_upper else 'CPI',
                'file_source': file_name,
                # "not seas. adj." contains "seas. adj." as a substring, so the
                # negated form has to be excluded explicitly; otherwise every
                # file would be flagged as seasonally adjusted.
                'seasonally_adjusted': (
                    'seas. adj.' in name_lower
                    and 'not seas. adj.' not in name_lower
                ),
            }

            # Add metadata to each record
            data = df.to_dict(orient="records")
            for record in data:
                record.update(metadata)

            if not data:
                # insert_many rejects an empty list; report the empty sheet
                # plainly instead of surfacing it as a processing error.
                print(f"Skipping {file_name}: no rows found")
                continue

            # Insert into MongoDB
            collection.insert_many(data)
            processed_files.append(file_name)
            print(f"Successfully inserted {len(data)} records from {file_name}")

        except Exception as e:
            # Best-effort import: report the failure and move on to the next file.
            print(f"Error processing {file_name}: {str(e)}")

print("\nProcessed Files:")
for file in processed_files:
    print(f"- {file}")

Processing file: Core CPI, not seas. adj..xlsx
Successfully inserted 31 records from Core CPI, not seas. adj..xlsx
Processing file: Core CPI, seas. adj..xlsx
Successfully inserted 31 records from Core CPI, seas. adj..xlsx
Processing file: CPI Price, % y-o-y, median weighted, seas. adj..xlsx
Successfully inserted 31 records from CPI Price, % y-o-y, median weighted, seas. adj..xlsx
Processing file: CPI Price, % y-o-y, nominal, seas. adj..xlsx
Successfully inserted 31 records from CPI Price, % y-o-y, nominal, seas. adj..xlsx
Processing file: CPI Price, nominal, not seas. adj..xlsx
Successfully inserted 31 records from CPI Price, nominal, not seas. adj..xlsx
Processing file: CPI Price, nominal, seas. adj..xlsx
Successfully inserted 31 records from CPI Price, nominal, seas. adj..xlsx
Processing file: GDP at market prices, constant 2010 LCU, millions, seas. adj..xlsx
Successfully inserted 31 records from GDP at market prices, constant 2010 LCU, millions, seas. adj..xlsx
Processing file: GDP 

In [15]:
# Display sample of the data: print the first two documents returned by an
# unfiltered query so the stored document layout can be inspected.
print("\nSample records from database:")
for record in collection.find().limit(2):
    print(record)


Sample records from database:
{'_id': ObjectId('67551105babcc9a9ae100496'), 'Unnamed: 0': nan, 'Albania': nan, 'Armenia': nan, 'Belgium': nan, 'Belarus': nan, 'Brazil': nan, 'Canada': nan, 'Switzerland': nan, 'Chile': nan, 'China': nan, 'Cameroon': nan, 'Colombia': nan, 'Costa Rica': nan, 'Cyprus': nan, 'Czech Republic': nan, 'Germany': nan, 'Denmark': nan, 'Dominican Republic': nan, 'Ecuador': nan, 'Egypt, Arab Rep.': nan, 'Spain': nan, 'Fiji': nan, 'France': nan, 'United Kingdom': nan, 'Georgia': nan, 'Greece': nan, 'Guatemala': nan, 'Hong Kong SAR, China': nan, 'Honduras': nan, 'Croatia': nan, 'Hungary': nan, 'Indonesia': nan, 'India': nan, 'Ireland': nan, 'Iraq': nan, 'Iceland': nan, 'Israel': nan, 'Italy': nan, 'Jordan': nan, 'Japan': nan, 'Kyrgyz Republic': nan, 'Korea, Rep.': nan, 'Kuwait': nan, 'Lao, PDR': nan, 'Sri Lanka': nan, 'Lithuania': nan, 'Luxembourg': nan, 'Latvia': nan, 'Morocco': nan, 'Moldova, Rep.': nan, 'Mexico': nan, 'North Macedonia': nan, 'Malta': nan, 'Maurit

In [21]:
def explore_data():
    """Print a short summary of the documents in the module-level ``collection``.

    Reports, in order:
      * the number of documents tagged ``GDP`` and ``CPI``;
      * how many country columns one sample document has, plus the first five;
      * the smallest and largest numeric value found in the ``Unnamed: 0``
        field, printed as a year range.

    The function only reads from the database and prints; it returns None.
    An empty collection produces zero counts and a "no valid years" message
    instead of raising.
    """
    # Keys that are bookkeeping rather than country columns.
    non_country_keys = {
        '_id', 'Unnamed: 0', 'indicator_type', 'file_source', 'seasonally_adjusted',
    }

    # Count records by indicator type
    gdp_count = collection.count_documents({"indicator_type": "GDP"})
    cpi_count = collection.count_documents({"indicator_type": "CPI"})

    print(f"Total GDP records: {gdp_count}")
    print(f"Total CPI records: {cpi_count}")

    # Get list of all countries from one sample document. Prefer a GDP
    # document, fall back to any document, and tolerate an empty collection
    # (find_one returns None when nothing matches).
    sample_record = collection.find_one({"indicator_type": "GDP"})
    if sample_record is None:
        sample_record = collection.find_one()
    countries = [key for key in (sample_record or {}) if key not in non_country_keys]

    print(f"\nNumber of countries in dataset: {len(countries)}")
    print("\nSample countries:", countries[:5])

    # Check date range: keep only values that look like plain non-negative
    # numbers, and skip anything float() still rejects (e.g. "1.2.3", which
    # passes the digit check once its dots are removed).
    years = []
    for raw_year in collection.distinct("Unnamed: 0"):
        if not str(raw_year).replace('.', '').isdigit():
            continue
        try:
            years.append(float(raw_year))
        except (TypeError, ValueError):
            continue

    if years:  # Make sure we have valid years
        print(f"\nDate range: {int(min(years))} to {int(max(years))}")
    else:
        print("\nNo valid years found in the dataset")

# Print the summary for whatever is currently stored in the collection.
explore_data()

Total GDP records: 245
Total CPI records: 186

Number of countries in dataset: 91

Sample countries: ['Albania', 'United Arab Emirates', 'Argentina', 'Australia', 'Austria']

Date range: 1995 to 2024
