In [1]:
import requests
import json
import pandas as pd

# Step 1: Set the URL
url = "https://www.forbes.com/forbesapi/person/rtb/0/-estWorthPrev/true.json?fields=rank,uri,personName,lastName,gender,source,industries,countryOfCitizenship,birthDate,finalWorth,estWorthPrev,imageExists,squareImage,listUri"

# Step 2: Make a GET request to fetch the data
response = requests.get(url)

# Step 3: Check if the request was successful
if response.status_code == 200:
    data = response.json()  # Parse JSON

# Navigate to the relevant data
    person_list = data.get("personList", {}).get("personsLists", [])

    # Step 4: Process and store the data
    if person_list:
        # Convert to a Pandas DataFrame for easier manipulation
        df = pd.DataFrame(person_list)
        print(df.head())  # Display first few rows

        # Optionally save to a CSV file
        df.to_csv("forbes_data.csv", index=False)
        print("Data saved to forbes_data.csv")
    else:
        print("No person data found in the response.")
else:
    print(f"Failed to fetch data. HTTP Status Code: {response.status_code}")

               uri  rank listUri  imageExists  finalWorth  \
0        elon-musk     1     rtb         True  334260.965   
1    larry-ellison     2     rtb         True  235255.878   
2       jeff-bezos     3     rtb         True  213507.954   
3  mark-zuckerberg     4     rtb         True  193464.600   
4  bernard-arnault     5     rtb         True  155493.383   

                 personName         source          industries  \
0                 Elon Musk  Tesla, SpaceX        [Automotive]   
1             Larry Ellison         Oracle        [Technology]   
2                Jeff Bezos         Amazon        [Technology]   
3           Mark Zuckerberg       Facebook        [Technology]   
4  Bernard Arnault & family           LVMH  [Fashion & Retail]   

  countryOfCitizenship gender     birthDate    lastName  wealthList  \
0        United States      M  4.691520e+10        Musk       False   
1        United States      M -8.007552e+11     Ellison       False   
2        United States 

In [None]:
from datetime import datetime

# Convert 'birthDate' to datetime, handling potential overflow
df['birthDate'] = pd.to_datetime(df['birthDate'], unit='ms')

# Filter out rows with invalid birthDate (NaT)
df = df.dropna(subset=['birthDate'])

# Calculate age
current_date = datetime.now()
df['age'] = df['birthDate'].apply(lambda x: current_date.year - x.year - ((current_date.month, current_date.day) < (x.month, x.day)))

# Drop birthDate column
df = df.drop('birthDate', axis=1)

df = df.drop(['rank', 'finalWorth', 'source','estWorthPrev','squareImage', 'uri', 'imageExists','wealthList','listUri','lastName'], axis=1)

In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote

# Function to scrape education details from Wikipedia
def scrape_education(person_name):
    base_url = "https://en.wikipedia.org/wiki/"
    url = base_url + quote(person_name.replace(" ", "_"))  # Handle spaces and special characters
    try:
        response = requests.get(url, headers={"User-Agent": "YourAppName/1.0 (your_email@example.com)"})
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            infobox = soup.find('table', {'class': 'infobox'})
            if infobox:
                for row in infobox.find_all('tr'):
                    if 'Education' in row.text:
                        return row.find('td').text.strip()
            return "Education not found"
        elif response.status_code == 404:
            return "Page not found"
        else:
            return f"Error: {response.status_code}"
    except Exception as e:
        return f"Error: {str(e)}"



# Fetch education details for the first 20 persons and add to the DataFrame
df['education'] = df['personName'].apply(scrape_education)

# Display the updated limited DataFrame
df.head()





Unnamed: 0,personName,industries,countryOfCitizenship,gender,familyList,bioSuppress,age,education
0,Elon Musk,[Automotive],United States,M,False,False,53,"University of Pennsylvania(BA, BS)"
1,Larry Ellison,[Technology],United States,M,False,False,80,"University of Illinois, Urbana-Champaign (no d..."
2,Jeff Bezos,[Technology],United States,M,False,False,60,Princeton University (BSE)
3,Mark Zuckerberg,[Technology],United States,M,False,False,40,Harvard University (dropped out)
4,Bernard Arnault & family,[Fashion & Retail],France,M,False,False,75,Page not found


In [4]:
df.head(4)

Unnamed: 0,personName,industries,countryOfCitizenship,gender,familyList,bioSuppress,age
0,Elon Musk,[Automotive],United States,M,False,False,53
1,Larry Ellison,[Technology],United States,M,False,False,80
2,Jeff Bezos,[Technology],United States,M,False,False,60
3,Mark Zuckerberg,[Technology],United States,M,False,False,40


In [None]:
import pandas as pd
import re

# Define a function to extract all matching education levels, including unfinished degrees
def extract_levels(education):
    levels_mapping = {
        r'\bBA\b|\bBS\b|\bBSE\b|bachelor': 'Bachelor',
        r'\bMA\b|\bMS\b|master': 'Master',
        r'\bPhD\b|doctorate': 'PhD',
        r'high school': 'High School',
        r'middle school': 'Middle School',
        r'elementary': 'Elementary',
        r'college dropout': 'College Dropout',
        r"high school dropout": 'High School Dropout',
        r"didn’t attend|didn't attend": 'Didn’t Attend',
        r'dropped out': 'Dropped Out',
        r'\bunfinished PhD\b|incomplete PhD': 'Unfinished PhD',
        r'\bunfinished Master\b|incomplete Master': 'Unfinished Master',
        r'\bunfinished Bachelor\b|incomplete Bachelor': 'Unfinished Bachelor',
    }

    # Find all matches and return as a semicolon-separated string
    levels = []
    for pattern, level in levels_mapping.items():
        if re.search(pattern, education, re.IGNORECASE):
            levels.append(level)
    return "; ".join(sorted(set(levels))) if levels else "Unknown"

# Define a function to extract university names (unchanged)
def extract_university(education):
    university_patterns = [
        r'(?:University of [^\n,;()]+)',
        r'(?:College of [^\n,;()]+)',
        r'(?:[A-Za-z]+ Institute)',
        r'(?:[A-Za-z]+ Academy)',
        r'(?:[A-Za-z]+ School)',
        r'(?:Urbana-Champaign)',
        r'(?:[A-Za-z]+ University)'
    ]
    matches = []
    for pattern in university_patterns:
        matches.extend(re.findall(pattern, education, re.IGNORECASE))
    return "; ".join(sorted(set(matches))) if matches else "Not specified"



# Apply the updated functions
df['level'] = df['education'].apply(extract_levels)
df['university'] = df['education'].apply(extract_university)




In [None]:
df=df.drop(['education'],axis=1)

In [None]:
df.head(7)

In [12]:
import nltk
import re
import pandas as pd
from collections import defaultdict

# Load the text file
with open('/Users/youssefabdelrazik/Downloads/wikipedia_facts.txt', 'r') as file:
    text = file.read()

# Split the text into blocks for each person
person_blocks = text.split("\n\n")  # Assuming each person's data is separated by a blank line

# Dictionary to store extracted features
data = defaultdict(dict)

for block in person_blocks:
    # Extract person's name
    person_name = re.search(r"Person: (.+)", block)
    if person_name:
        name = person_name.group(1).strip()
    else:
        continue  # Skip if no person name is found

    # Extract 'Born' details
    born_match = re.search(r"Born: (.+)", block)
    if born_match:
        data[name]['Born'] = born_match.group(1).strip()

    # Extract 'Education' or 'Alma Mater' details
    education_match = re.search(r"(Education|Alma Mater): (.+)", block)
    if education_match:
        data[name]['Education'] = education_match.group(2).strip()

# Convert the dictionary to a list of dictionaries for pandas
data_list = [{"Name": person, **features} for person, features in data.items()]

# Create a DataFrame
bbb = pd.DataFrame(data_list)

# Save the DataFrame to a CSV file
bbb.to_csv('DATAA.csv', index=False)

# Display the DataFrame
bbb.head(6)


Unnamed: 0,Name,Born,Education
0,Elon Musk,"Elon Reeve Musk ( 1971-06-28 ) June 28, 1971 (...","University of Pennsylvania ( BA , BS )"
1,Larry Ellison,Lawrence Joseph Ellison ( 1944-08-17 ) August ...,"University of Illinois, Urbana-Champaign (no d..."
2,Jeff Bezos,Jeffrey Preston Jorgensen ( 1964-01-12 ) Janua...,Princeton University ( BSE )
3,Mark Zuckerberg,"Mark Elliot Zuckerberg ( 1984-05-14 ) May 14, ...",Harvard University (dropped out)
4,Warren Buffett,Warren Edward Buffett ( 1930-08-30 ) August 30...,University of Pennsylvania University of Nebra...
5,Larry Page,"Lawrence Edward Page ( 1973-03-26 ) March 26, ...",University of Michigan ( BSE ) Stanford Univer...


In [51]:
import re
import pandas as pd
from collections import defaultdict

# Load the text file
with open('/Users/youssefabdelrazik/Downloads/wikipedia_facts.txt', 'r') as file:
    text = file.read()

# Split the text into blocks for each person
person_blocks = text.split("\n\n")  # Assuming each person's data is separated by a blank line

# Dictionary to store extracted features
data = defaultdict(dict)

for block in person_blocks:
    # Extract person's name
    person_name = re.search(r"Person: (.+)", block)
    if person_name:
        name = person_name.group(1).strip()
    else:
        continue  # Skip if no person name is found

    # Extract 'Born' details
    born_match = re.search(r"Born: (.+)", block)
    if born_match:
        data[name]['Born'] = born_match.group(1).strip()

    # Extract 'Alma Mater' details
    alma_mater_match = re.search(r"Alma\s?mater: (.+)", block, re.IGNORECASE)
    if alma_mater_match:
        data[name]['Education'] = alma_mater_match.group(1).strip()



data_list = [{"Name": person, **features} for person, features in data.items()]

# Create a DataFrame
bbb3 = pd.DataFrame(data_list)



# Display the DataFrame
bbb3.head(5)

# Save to a CSV file
bbb3.to_csv('DATA5.csv', index=True)


In [None]:
merged_df4 = pd.merge(df1, df2, on='Name', how='outer', suffixes=('_df1', '_df2'))

# Combine 'Education' columns, prioritizing non-null values
merged_df4['Education'] = merged_df4['Education_df1'].combine_first(merged_df4['Education_df2'])

# Drop the intermediary columns
merged_df4 = merged_df4[['Name', 'Education']]

# Display the result

In [57]:
import pandas as pd

# Example DataFrame 1
df1 = pd.read_csv('/Users/youssefabdelrazik/DATAA.csv')
# Example DataFrame 2

# Merge the two DataFrames on 'Name'


df2=pd.read_csv('/Users/youssefabdelrazik/DATA5.csv')



merged_df4 = pd.merge(df1, df2, on='Name', how='outer', suffixes=('_df1', '_df2'))

# Combine 'Education' columns, prioritizing non-null values
merged_df4['Education'] = merged_df4['Education_df1'].combine_first(merged_df4['Education_df2'])

# Drop the intermediary columns
merged_df4 = merged_df4[['Name', 'Education']]






In [78]:
merged_df4.head(4)


Unnamed: 0,Name,Education
0,A. Jayson Adair,
1,Abigail Johnson,William Smith College ( BA ) Harvard Universit...
2,Achal Bakeri,CEPT University University of Southern California
3,Acharya Balkrishna,


In [81]:
merged_df4=merged_df4.dropna()




In [82]:
merged_df4.head(5)

Unnamed: 0,Name,Education
1,Abigail Johnson,William Smith College ( BA ) Harvard Universit...
2,Achal Bakeri,CEPT University University of Southern California
5,Adam Neumann,Israeli Naval Academy Baruch College
6,Adebayo Ogunlesi,"Lincoln College, Oxford Harvard Law School Har..."
7,Adi Godrej,"St. Xavier's College, Mumbai MIT Sloan School ..."


In [83]:
merged_df4.to_csv("tetty.csv")

In [66]:
df=pd.read_csv('/Users/youssefabdelrazik/forbes_data.csv')
from datetime import datetime
# Convert 'birthDate' from ms to datetime
df['birthDate'] = pd.to_datetime(df['birthDate'], unit='ms')

# Extract year from birthdate
df['Year'] = df['birthDate'].dt.year

# Function to classify the year into early or late era
def classify_era(year):
    if year >= 1837:  # Only classify years starting from 1837
        decade = int(year // 10 * 10)  # Find the decade
        if int(year % 10) <= 4:
            return f"Early {decade}s"
        else:
            return f"Late {decade}s"
    else:
        return "Page not found"

# Apply the era classification function
df['Era'] = df['Year'].apply(classify_era)

# Drop the 'Year' column if not needed
df.drop(columns=['Year'], inplace=True)
# Drop birthDate column
df = df.drop('birthDate', axis=1)

df = df.drop(['rank', 'finalWorth', 'source','estWorthPrev','squareImage', 'uri', 'imageExists','wealthList','listUri','lastName'], axis=1)

In [67]:

df.head(3)


Unnamed: 0,personName,industries,countryOfCitizenship,gender,familyList,bioSuppress,Era
0,Elon Musk,['Automotive'],United States,M,False,False,Early 1970s
1,Larry Ellison,['Technology'],United States,M,False,False,Early 1940s
2,Jeff Bezos,['Technology'],United States,M,False,False,Early 1960s


In [68]:

df=df.drop(['familyList','bioSuppress','industries'],axis=1)

In [70]:

df.rename(columns={'personName': 'Name'}, inplace=True)
df.head(5)

Unnamed: 0,Name,countryOfCitizenship,gender,Era
0,Elon Musk,United States,M,Early 1970s
1,Larry Ellison,United States,M,Early 1940s
2,Jeff Bezos,United States,M,Early 1960s
3,Mark Zuckerberg,United States,M,Early 1980s
4,Bernard Arnault & family,France,M,Late 1940s


In [123]:
dfsemi= df.merge(merged_df4, on='Name', how='inner')
dfsemi.head(4)



KeyError: 'Name'

In [121]:
dfsemi.to_csv('mmet.csv')

In [125]:

dfsemi=dfsemi.drop(['gdppercapita'],axis=1)
dfsemi.head()

Unnamed: 0.1,Unnamed: 0,Name,countryOfCitizenship,gender,Era,Education
0,0,Elon Musk,United States,M,Early 1970s,"University of Pennsylvania ( BA , BS )"
1,1,Larry Ellison,United States,M,Early 1940s,"University of Illinois, Urbana-Champaign (no d..."
2,2,Jeff Bezos,United States,M,Early 1960s,Princeton University ( BSE )
3,3,Mark Zuckerberg,United States,M,Early 1980s,Harvard University (dropped out)
4,4,Warren Buffett,United States,M,Early 1930s,University of Pennsylvania University of Nebra...


In [152]:
dfsemi.head(3)

Unnamed: 0.1,Unnamed: 0,Name,countryOfCitizenship,gender,Era,Education
0,0,Elon Musk,United States,M,Early 1970s,"University of Pennsylvania ( BA , BS )"
1,1,Larry Ellison,United States,M,Early 1940s,"University of Illinois, Urbana-Champaign (no d..."
2,2,Jeff Bezos,United States,M,Early 1960s,Princeton University ( BSE )


In [None]:
dfsemi=pd.read_csv('/Users/youssefabdelrazik/mmet.csv')

#  Filter the rows where countryOfCitizenship is 'United States'
dfusa = dfsemi[dfsemi['countryOfCitizenship'] == 'United States']


dfusa.shape
dfera=dfusa['Era'].unique()
print(dfera)

['Early 1970s' 'Early 1940s' 'Early 1960s' 'Early 1980s' 'Early 1930s'
 'Late 1950s' 'Late 1960s' 'Late 1940s' 'Late 1930s' 'Late 1980s'
 'Late 1970s' 'Early 1950s' 'Late 1920s' 'Early 1990s' 'Late 1990s'
 'Page not found']


In [163]:
dfsemi.shape

(907, 6)

In [146]:
distinct_countries = dfsemi['countryOfCitizenship'].unique()



In [147]:
dfsemi['countryOfCitizenship'] = dfsemi['countryOfCitizenship'].replace('Eswatini (Swaziland)', 'Eswatini')



In [148]:

distinct_countries = dfsemi['countryOfCitizenship'].unique()
print(distinct_countries)

['United States' 'India' 'Canada' 'China' 'Italy' 'France' 'Hong Kong'
 'Austria' 'Switzerland' 'Brazil' 'Australia' 'Japan' 'Russia' 'Indonesia'
 'Germany' 'Philippines' 'Israel' 'Thailand' 'United Kingdom'
 'United Arab Emirates' 'Singapore' 'Taiwan' 'Denmark' 'Malaysia'
 'Nigeria' 'Spain' 'Sweden' 'Norway' 'New Zealand' 'Czech Republic'
 'South Korea' 'Argentina' 'Colombia' 'Egypt' 'Eswatini' 'Ireland'
 'Belize' 'Chile' 'Poland' 'Kazakhstan' 'Turkey' 'Monaco' 'Georgia'
 'Venezuela' 'Finland' 'Cyprus' 'Vietnam' 'South Africa' 'Greece'
 'Lebanon' 'Guernsey' 'Iceland' 'Bulgaria' 'Ukraine' 'Zimbabwe' 'Tanzania'
 'Croatia' 'Romania' 'Netherlands' 'Peru' 'St. Kitts and Nevis' 'Hungary']


In [227]:
import requests
from bs4 import BeautifulSoup

# List of countries to scrape
countries = [
    'United States', 'India', 'Canada', 'China', 'Italy', 'France', 'Hong Kong', 
    'Austria', 'Switzerland', 'Brazil', 'Australia', 'Japan', 'Russia', 'Indonesia', 
    'Germany', 'Philippines', 'Israel', 'Thailand', 'United Kingdom', 'United Arab Emirates', 
    'Singapore', 'Taiwan', 'Denmark', 'Malaysia', 'Nigeria', 'Spain', 'Sweden', 'Norway', 
    'New Zealand', 'Czech Republic', 'South Korea', 'Argentina', 'Colombia', 'Egypt', 
    'Eswatini', 'Ireland', 'Belize', 'Chile', 'Poland', 'Kazakhstan', 'Turkey', 'Monaco', 
    'Georgia', 'Venezuela', 'Finland', 'Cyprus', 'Vietnam', 'South Africa', 'Greece', 
    'Lebanon', 'Guernsey', 'Iceland', 'Bulgaria', 'Ukraine', 'Zimbabwe', 'Tanzania', 'Croatia', 
    'Romania', 'Netherlands', 'Peru', 'St. Kitts and Nevis', 'Hungary'
]

# Function to fetch GDP data for a specific country and year
def fetch_gdp_data(country, year=1930):
    url = f'https://countryeconomy.com/gdp/{country.replace(" ", "-").lower()}?year={year}'
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    else:
        print(f"Error fetching data for {country}: {response.status_code}")
        return None
# Function to remove the last line from the text file
def remove_last_line(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # If the file has at least one line, remove the last line
    if lines:
        lines = lines[:-1]

    # Re-write the file without the last line
    with open(file_path, 'w') as file:
        file.writelines(lines)

        
# Function to remove the first 4 lines from the text file
def remove_first_n_lines(file_path, n=4):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Skip the first `n` lines
    if len(lines) > n:
        lines = lines[n:]

    # Re-write the file without the first `n` lines
    with open(file_path, 'w') as file:
        file.writelines(lines)

# Loop through each country in the list
for country in countries:
    print(f"Scraping data for {country}...")

    # Fetch the GDP data for the country and year 1930
    soup = fetch_gdp_data(country)

    if soup:
        # Open the file in write mode (this will create the file if it doesn't exist)
        file_path = f'{country}_gdp_1930.txt'
        with open(file_path, 'w') as file:
            # Write the year at the top of the file
            file.write(f"GDP Data for {country} ({1930})\n\n")
            
            # Find the table with the specified classes
            table = soup.find('table', class_='table tabledat table-striped table-condensed table-hover')

            # Check if the table exists
            if table:
                # Extract all rows from the table
                rows = table.find_all('tr')

                # Loop through each row and extract the columns (td)
                for row in rows:
                    cols = row.find_all('td')  # Extract all table data cells
                    cols = [col.text.strip() for col in cols]  # Clean the text (strip unnecessary whitespace)

                    if cols:  # Only write rows that contain data
                        # Remove any rows where any column ends with 'M' or '%'
                        valid_cols = [col for col in cols if not (col.endswith('%') or col.endswith('M'))]
                        if valid_cols:  # Write to file only if there are valid columns left
                            file.write(f"Year: {1930}, Data: {valid_cols}\n")
            
            # Now search for the 'numero dol' class within the body of the page
            body = soup.find('body')  # Find the <body> tag
            if body:
                numero_dol_elements = body.find_all(class_='numero dol')  # Search for elements with the 'numero dol' class

                if numero_dol_elements:
                    for element in numero_dol_elements:
                        element_text = element.text.strip()

                        # Skip elements that end with '%' or 'M'
                        if element_text.endswith('%') or element_text.endswith('M'):
                            continue  # Skip this element
                        
                        # Write the valid elements to the file
                        file.write(f"Found valid 'numero dol' element: {element_text}\n")
                else:
                    file.write(f" No elements with class 'numero dol' found.\n")
            else:
                file.write(f"No body found in the page.\n")

        # After exporting the data, remove the first 4 lines from the file
        remove_first_n_lines(file_path, n=6)
        remove_last_line(file_path)

    else:
        print(f"No data found for {country}.")


Scraping data for United States...
Scraping data for India...
Scraping data for Canada...
Scraping data for China...
Scraping data for Italy...
Scraping data for France...
Scraping data for Hong Kong...
Scraping data for Austria...
Scraping data for Switzerland...
Scraping data for Brazil...
Scraping data for Australia...
Scraping data for Japan...
Scraping data for Russia...
Scraping data for Indonesia...
Scraping data for Germany...
Scraping data for Philippines...
Scraping data for Israel...
Scraping data for Thailand...
Scraping data for United Kingdom...
Scraping data for United Arab Emirates...
Scraping data for Singapore...
Scraping data for Taiwan...
Scraping data for Denmark...
Scraping data for Malaysia...
Scraping data for Nigeria...
Scraping data for Spain...
Scraping data for Sweden...
Scraping data for Norway...
Scraping data for New Zealand...
Scraping data for Czech Republic...
Scraping data for South Korea...
Scraping data for Argentina...
Scraping data for Colombia...

In [None]:
import pandas as pd

# File path
file_path = "/Users/youssefabdelrazik/Downloads/APINY/wrld.csv"  # Replace with the path to your file

# Load the dataset with error handling
try:
    # Attempt to load CSV with proper handling for bad lines
    df = pd.read_csv(file_path, delimiter=',', header=0, on_bad_lines='skip', quotechar='"')

    # Check the first few rows to understand the structure


    # Filter data for United States
    us_data = df[df['Country Name'] == 'United States']

    # Select data from 1970 to 2023
    years = [str(year) for year in range(1970, 2024)]
    columns_to_keep = ['Country Name', 'Indicator Name'] + years
    us_data_filtered = us_data[columns_to_keep]

    # Save to a new CSV file
    output_file = "us_gdp_1970_2023.csv"
    us_data_filtered.to_csv(output_file, index=False)
    print(f"Filtered data saved to {output_file}")

except Exception as e:
    print(f"Error reading the CSV file: {e}")


Error reading the CSV file: 'Country Name'


In [284]:
import pandas as pd
dfusa=pd.read_csv('/Users/youssefabdelrazik/Downloads/xsx.csv')
usa=dfusa[dfusa['Country Name']=='United States']

usa=usa.drop(['Country Code','Indicator Name','Indicator Code'],axis=1)
usa.head(2)

Unnamed: 0,Country Name,1960,1961,1962,1963,1964,1965,1966,1967,1968,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,Unnamed: 68
251,United States,2999.864872,3059.160821,3236.013112,3366.36979,3565.31443,3818.288251,4136.308296,4325.959351,4684.588403,...,57040.208214,58206.614193,60322.261424,63201.045848,65548.070785,64317.398913,71055.876194,77246.673883,81695.187071,


In [236]:
import re
import pandas as pd

# File name
input_file = "Argentina_gdp_1930.txt"
output_file = "Argentina_GDP3.csv"

# Regular expression pattern



pattern = r"Found valid 'numero dol' element: \$(\d{1,3}(?:,\d{3})*)"


# Read the file
with open(input_file, 'r') as file:
    data = file.read()

# Find all matches using the regular expression
matches = re.findall(pattern, data)

# Convert matches to a list of GDP values as integers (removing commas)
gdp_values = [int(gdp.replace(',', '')) for gdp in matches]

# Create a DataFrame with descending years starting from 2023
years = list(range(2023, 2023 - len(gdp_values), -1))
df = pd.DataFrame({"Year": years, "GDP": gdp_values})

# Save to a CSV file
df.to_csv(output_file, index=False)

print(f"Data has been extracted and saved to {output_file}.")


Data has been extracted and saved to Argentina_GDP3.csv.


In [None]:
import re
import pandas as pd

# File name
input_file = "Argentina_gdp_1930.txt"
output_file = "Argentina_GDP3.csv"

# Regular expression pattern



pattern = r"Found valid 'numero dol' element: \$(\d{1,3}(?:,\d{3})*)"


# Read the file
with open(input_file, 'r') as file:
    data = file.read()

# Find all matches using the regular expression
matches = re.findall(pattern, data)

# Convert matches to a list of GDP values as integers (removing commas)
gdp_values = [int(gdp.replace(',', '')) for gdp in matches]

# Create a DataFrame with descending years starting from 2023
years = list(range(2023, 2023 - len(gdp_values), -1))
df = pd.DataFrame({"Year": years, "GDP": gdp_values})

# Save to a CSV file
df.to_csv(output_file, index=False)

print(f"Data has been extracted and saved to {output_file}.")


In [216]:
import pandas as pd
import os

# Load all country-specific GDP CSV files into a dictionary of DataFrames
input_directory='./'
gdp_data = {}
for country in countries:
    # Generate the expected filename
    filename = f"{country.replace(' ', '_')}_gdp.csv"
    input_path = os.path.join(input_directory, filename)

    # Check if the file exists before loading
    if os.path.exists(input_path):
        gdp_data[country] = pd.read_csv(input_path)
    else:
        print(f"File not found: {input_path}")
    
# Example DataFrame
# Replace this with your actual DataFrame
dff=pd.read_csv('/Users/youssefabdelrazik/mmet.csv')

# Define custom era ranges
era_ranges = {
    "Early 1930s": list(range(1930, 1935)),
    "Late 1930s": list(range(1935, 1940)),
    "Early 1940s": list(range(1940, 1945)),
    "Late 1940s": list(range(1945, 1950)),
    "Early 1950s": list(range(1950, 1955)),
    "Late 1950s": list(range(1955, 1960)),
    "Early 1960s": list(range(1960, 1965)),
    "Late 1960s": list(range(1965, 1970)),
    "Early 1970s": list(range(1970, 1975)),
    "Late 1970s": list(range(1975, 1980)),
    "Early 1980s": list(range(1980, 1985)),
    "Late 1980s": list(range(1985, 1990)),
    "Early 1990s": list(range(1990, 1995)),
    "Late 1990s": list(range(1995, 2000))
}

# Function to calculate GDP for a given country and Era
def calculate_average_gdp(country, era):
    if country not in gdp_data:
        return None  # Return None if GDP data for the country is not available

    # Get the year range for the specified Era
    years = era_ranges.get(era)
    if not years:
        return None  # Return None for unsupported Era values

    # Get the country's GDP data
    country_gdp = gdp_data[country]

    # Filter GDP data for the specified years and calculate the average
    avg_gdp = country_gdp[country_gdp["Year"].isin(years)]["GDP"].mean()
    return avg_gdp

# Apply the function to calculate GDP for each row in the DataFrame
dff["gdp"] = dff.apply(lambda row: calculate_average_gdp(row["countryOfCitizenship"], row["Era"]), axis=1)

# Output the resulting DataFrame
dff.head(4)


File not found: ./Eswatini_gdp.csv
File not found: ./Guernsey_gdp.csv
File not found: ./St._Kitts_and_Nevis_gdp.csv


KeyError: 'Year'

In [210]:
print(dff.columns)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Name', 'countryOfCitizenship', 'gender',
       'Era', 'Education'],
      dtype='object')
