<a href="https://colab.research.google.com/github/yusufOcakoglu/Deciding_the_MVP/blob/main/collect_polls.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Wahlrecht.de - polls published by the Forsa institute (HTML page)
url = "https://www.wahlrecht.de/umfragen/forsa.htm"

try:
    # Fetch every table found on the page as a list of DataFrames.
    # decimal=',' and thousands='.' tell pandas to expect German number formatting.
    # NOTE(review): the captured output shows the first column as e.g. 25112025,
    # which suggests thousands='.' also strips the dots from dates such as
    # 25.11.2025 - confirm, and parse that column separately if so.
    tables = pd.read_html(url, decimal=',', thousands='.')

    # The poll table is assumed to sit at index 1 of the parsed tables
    # (the original author expected it at index 0 or 1) - TODO confirm this
    # stays true if the page layout changes; an IndexError here is caught below.
    df_forsa = tables[1]

    print("✅ Forsa Anket Verisi (Almanya) başarıyla çekildi!")
    print("\nÇekilen Tablonun İlk 5 Satırı:")
    # NOTE(review): in the captured output the party columns still hold text
    # such as "25 %" and "–", so numeric conversion is left to a later step.
    print(df_forsa.head())

    # Save as CSV (raw, to be cleaned in the next step)
    df_forsa.to_csv("forsa_polls_germany_raw.csv", index=False)

except Exception as e:
    # Best-effort cell: any failure above (download, parsing, indexing, writing)
    # is reported as a message instead of being raised.
    print(f"❌ Tablo çekilirken hata oluştu: {e}")

✅ Forsa Anket Verisi (Almanya) başarıyla çekildi!

Çekilen Tablonun İlk 5 Satırı:
  Unnamed: 0 Unnamed: 1 CDU/CSU   SPD GRÜNE  FDP LINKE   AfD FW  BSW Sonstige  \
0   25112025        NaN    25 %  14 %  12 %  3 %  11 %  26 %  –  3 %      6 %   
1   18112025        NaN    25 %  14 %  12 %  3 %  11 %  26 %  –    –      9 %   
2   11112025        NaN    24 %  14 %  12 %  3 %  11 %  26 %  –  3 %      7 %   
3   04112025        NaN    24 %  14 %  12 %  3 %  12 %  26 %  –  3 %      6 %   
4   28102025        NaN    25 %  13 %  12 %  3 %  12 %  26 %  –  3 %      6 %   

  Unnamed: 11 Nichtwähler/ Unentschl. Befragte       Zeitraum  
0         NaN                    26 %     2501  18.11.–24.11.  
1         NaN                    25 %     2502  11.11.–17.11.  
2         NaN                    24 %     2503  04.11.–10.11.  
3         NaN                    23 %     2500  28.10.–03.11.  
4         NaN                    23 %     2502  21.10.–27.10.  


In [1]:
pip install eurostat

Collecting eurostat
  Downloading eurostat-1.1.1-py3-none-any.whl.metadata (26 kB)
Downloading eurostat-1.1.1-py3-none-any.whl (16 kB)
Installing collected packages: eurostat
Successfully installed eurostat-1.1.1


In [3]:
import pandas as pd
import os

def process_migration_data(input_file: str, output_file: str) -> "pd.DataFrame | None":
    """Clean a CSV export and sum its observation values per country and period.

    The function reads ``input_file``, keeps only the ``geo``, ``TIME_PERIOD``
    and ``OBS_VALUE`` columns, coerces ``OBS_VALUE`` to a numeric dtype, drops
    rows whose value is missing or non-numeric, sums ``OBS_VALUE`` for every
    (``geo``, ``TIME_PERIOD``) pair, sorts by those two keys and writes the
    result to ``output_file`` without the index. Progress and a short summary
    are printed along the way.

    NOTE(review): the sum runs over every row sharing a (geo, TIME_PERIOD)
    pair. If the export holds both totals and their breakdowns in other
    dimension columns, this would double-count - verify against the input file.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write.

    Returns:
        The processed DataFrame on success. ``None`` when the input file does
        not exist, a required column is absent, or any exception is raised
        while processing (the error is printed, not re-raised).
    """
    required_columns = ['geo', 'TIME_PERIOD', 'OBS_VALUE']

    # Bail out early with a readable message instead of a pandas traceback.
    if not os.path.exists(input_file):
        print(f"Error: The file '{input_file}' was not found.")
        return None

    try:
        print(f"Reading {input_file}...")
        df = pd.read_csv(input_file)

        # Every required column must be present before anything is extracted.
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            print(f"Error: The following columns were not found in the CSV: {missing_cols}")
            return None

        # Work on a copy so the frame read from disk is never modified.
        extracted_df = df[required_columns].copy()

        # errors='coerce' turns non-numeric markers (e.g. ":" or "b") into NaN
        # so they can be detected and dropped below.
        extracted_df['OBS_VALUE'] = pd.to_numeric(extracted_df['OBS_VALUE'], errors='coerce')

        # Row count is taken before dropping/aggregating, for the summary.
        initial_count = len(extracted_df)
        missing_count = extracted_df['OBS_VALUE'].isna().sum()

        if missing_count > 0:
            print(f"Found {missing_count} rows with missing or invalid Observation Values.")
            extracted_df = extracted_df.dropna(subset=['OBS_VALUE'])
            print("Dropped rows with missing values.")
        else:
            print("No missing values found.")

        # Collapse multiple rows for the same country and period into one by
        # summing their values.
        extracted_df = extracted_df.groupby(['geo', 'TIME_PERIOD'], as_index=False)['OBS_VALUE'].sum()
        print("Aggregated duplicate entries for 'geo' and 'TIME_PERIOD'.")

        # Explicit ordering by country, then period, for readability.
        extracted_df = extracted_df.sort_values(by=['geo', 'TIME_PERIOD'])

        extracted_df.to_csv(output_file, index=False)

        print(f"\nSuccess! Processed data saved to '{output_file}'")
        print(f"Original rows before aggregation: {initial_count}")
        print(f"Final rows after processing and aggregation: {len(extracted_df)}")
        print("\nFirst 5 rows of the new file:")
        print(extracted_df.head())

        # Hand the result back so later cells can use it without re-reading
        # the CSV that was just written.
        return extracted_df

    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure via None.
        print(f"An unexpected error occurred: {e}")
        return None

if __name__ == "__main__":
    # Raw CSV export to read, and the cleaned CSV to produce from it.
    input_csv = 'migr_asyappctzm__custom_19122438_linear_2_0.csv'
    output_csv = 'processed_migration_data.csv'

    # Run the migration pipeline on the files named above.
    process_migration_data(input_file=input_csv, output_file=output_csv)

Reading migr_asyappctzm__custom_19122438_linear_2_0.csv...
No missing values found.
Aggregated duplicate entries for 'geo' and 'TIME_PERIOD'.

Success! Processed data saved to 'processed_migration_data.csv'
Original rows before aggregation: 5064
Final rows after processing and aggregation: 518

First 5 rows of the new file:
  geo TIME_PERIOD  OBS_VALUE
0  DE     2015-01      99565
1  DE     2015-02     103520
2  DE     2015-03     127720
3  DE     2015-04     109105
4  DE     2015-05     105395


In [4]:
import pandas as pd
import os

def process_unemployment_data(input_file: str, output_file: str) -> "pd.DataFrame | None":
    """Clean a CSV export down to country, period and observation value.

    The function reads ``input_file``, keeps only the ``geo``, ``TIME_PERIOD``
    and ``OBS_VALUE`` columns, coerces ``OBS_VALUE`` to a numeric dtype, drops
    rows whose value is missing or non-numeric, sorts by ``geo`` then
    ``TIME_PERIOD`` and writes the result to ``output_file`` without the
    index. Rows are not aggregated. Progress and a short summary are printed
    along the way.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write.

    Returns:
        The processed DataFrame on success. ``None`` when the input file does
        not exist, a required column is absent, or any exception is raised
        while processing (the error is printed, not re-raised).
    """
    required_columns = ['geo', 'TIME_PERIOD', 'OBS_VALUE']

    # Bail out early with a readable message instead of a pandas traceback.
    if not os.path.exists(input_file):
        print(f"Error: The file '{input_file}' was not found.")
        return None

    try:
        print(f"Reading {input_file}...")
        df = pd.read_csv(input_file)

        # Every required column must be present before anything is extracted.
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            print(f"Error: The following columns were not found in the CSV: {missing_cols}")
            return None

        # Work on a copy so the frame read from disk is never modified.
        extracted_df = df[required_columns].copy()

        # errors='coerce' turns non-numeric markers (e.g. ":" or "b") into NaN
        # so they can be detected and dropped below.
        extracted_df['OBS_VALUE'] = pd.to_numeric(extracted_df['OBS_VALUE'], errors='coerce')

        # Row count is taken before dropping, for the summary.
        initial_count = len(extracted_df)
        missing_count = extracted_df['OBS_VALUE'].isna().sum()

        if missing_count > 0:
            print(f"Found {missing_count} rows with missing or invalid Observation Values.")
            extracted_df = extracted_df.dropna(subset=['OBS_VALUE'])
            print("Dropped rows with missing values.")
        else:
            print("No missing values found.")

        # Order by country, then period, for readability.
        extracted_df = extracted_df.sort_values(by=['geo', 'TIME_PERIOD'])

        extracted_df.to_csv(output_file, index=False)

        print(f"\nSuccess! Processed data saved to '{output_file}'")
        print(f"Original rows: {initial_count}")
        print(f"Final rows: {len(extracted_df)}")
        print("\nFirst 5 rows of the new file:")
        print(extracted_df.head())

        # Hand the result back so later cells can use it without re-reading
        # the CSV that was just written.
        return extracted_df

    except Exception as e:
        # Deliberately best-effort: report the problem and signal failure via None.
        print(f"An unexpected error occurred: {e}")
        return None

if __name__ == "__main__":
    # Raw CSV export to read, and the cleaned CSV to produce from it.
    input_csv = 'une_rt_m_page_linear_2_0.csv'
    output_csv = 'processed_unemployment_data.csv'

    # Run the unemployment pipeline on the files named above.
    process_unemployment_data(input_file=input_csv, output_file=output_csv)

Reading une_rt_m_page_linear_2_0.csv...
No missing values found.

Success! Processed data saved to 'processed_unemployment_data.csv'
Original rows: 517
Final rows: 517

First 5 rows of the new file:
  geo TIME_PERIOD  OBS_VALUE
0  DE     2015-01        4.5
1  DE     2015-02        4.5
2  DE     2015-03        4.5
3  DE     2015-04        4.4
4  DE     2015-05        4.4
