In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the webpage to scrape
url = 'https://edurank.org/geo/de-berlin/'

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all university blocks
    university_blocks = soup.find_all('div', class_='block-cont')

    # List to store the scraped data
    universities_info = []

    for block in university_blocks:
        # Extract the university name
        university_name_tag = block.find('h2', class_='h4 font-weight-bold text-center')
        if university_name_tag and university_name_tag.find('a'):
            university_name = university_name_tag.find('a').text.strip()
        else:
            university_name = 'N/A'

        # Extract the rankings
        ranks = block.find_all('div', class_='uni-card__rank')
        rank_in_berlin_brandenburg = ranks[0].text.strip() if len(ranks) > 0 else 'N/A'
        rank_in_germany = ranks[1].text.strip() if len(ranks) > 1 else 'N/A'

        # Clean up the rankings to extract just the number
        rank_in_berlin_brandenburg = rank_in_berlin_brandenburg.split('#')[-1].strip()
        rank_in_germany = rank_in_germany.split('#')[-1].strip()

        # Extract additional information
        acceptance_rate_tag = block.find('dt', text='Acceptance Rate')
        acceptance_rate = acceptance_rate_tag.find_next('dd').text.strip() if acceptance_rate_tag else 'N/A'

        enrollment_tag = block.find('dt', text='Enrollment')
        enrollment = enrollment_tag.find_next('dd').text.strip() if enrollment_tag else 'N/A'

        founded_tag = block.find('dt', text='Founded')
        founded = founded_tag.find_next('dd').text.strip() if founded_tag else 'N/A'

        # Append the extracted information to the list
        universities_info.append({
            'University Name': university_name,
            'Rank in Berlin-Brandenburg': rank_in_berlin_brandenburg,
            'Rank in Germany': rank_in_germany,
            'Acceptance Rate': acceptance_rate,
            'Enrollment': enrollment,
            'Founded': founded
        })

    # Convert the list to a DataFrame
    df = pd.DataFrame(universities_info)

    # Save the DataFrame to a CSV file
    df.to_csv('berlin_universities_list.csv', index=False)

    print("Scraped dataset saved to berlin_universities_list.csv")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Scraped dataset saved to berlin_universities_list.csv


  acceptance_rate_tag = block.find('dt', text='Acceptance Rate')
  enrollment_tag = block.find('dt', text='Enrollment')
  founded_tag = block.find('dt', text='Founded')


In [10]:
df=pd.read_csv('berlin_universities_list.csv')
df.head(40)

Unnamed: 0,University Name,Rank in Berlin-Brandenburg,Rank in Germany,Acceptance Rate,Enrollment,Founded
0,,,,,,
1,1. Free University of Berlin,1 in Brandenburg,5 in Germany,15%,37908.0,1948.0
2,2. Humboldt University of Berlin,2 in Brandenburg,8 in Germany,,36232.0,1809.0
3,3. Technical University of Berlin,3 in Brandenburg,19 in Germany,,34842.0,1946.0
4,4. Charite - Medical University of Berlin,4 in Brandenburg,21 in Germany,,9340.0,1710.0
5,5. Berlin University of Applied Sciences,7 in Brandenburg,84 in Germany,,14167.0,1994.0
6,6. Berlin School of Economics and Law,9 in Brandenburg,92 in Germany,,12246.0,1971.0
7,7. Berlin Technical University of Applied Scie...,10 in Brandenburg,95 in Germany,,12833.0,1971.0
8,8. Berlin University of the Arts,11 in Brandenburg,98 in Germany,12%,4201.0,1975.0
9,9. Hertie School of Governance,12 in Brandenburg,100 in Germany,,586.0,2003.0


In [16]:
import pandas as pd
import re

# Load the dataset
df = pd.read_csv('berlin_universities_list.csv')

# 1. Remove rows where the university name is 'N/A'
df = df[df['University Name'] != 'N/A']

# 2. Extract only the numeric part from the Rank columns and remove dots and trailing zeros
def extract_number(text):
    if pd.isna(text):
        return None
    match = re.search(r'\d+', text)
    return int(match.group()) if match else None

df['Rank in Berlin-Brandenburg'] = df['Rank in Berlin-Brandenburg'].apply(extract_number)
df['Rank in Germany'] = df['Rank in Germany'].apply(extract_number)

# 3. Clean the Founded column
def clean_founded(text):
    if pd.isna(text):
        return None
    match = re.search(r'\d+', str(text))  # Ensure text is converted to string
    return int(match.group()) if match else None

df['Founded'] = df['Founded'].apply(clean_founded)

# 4. Remove percentage sign from Acceptance Rate and convert to integer
def clean_acceptance_rate(text):
    if pd.isna(text):
        return None
    return int(re.sub(r'[^0-9]', '', text))

df['Acceptance Rate'] = df['Acceptance Rate'].apply(clean_acceptance_rate)

# 5. Ensure all numeric columns are integers
df['Rank in Berlin-Brandenburg'] = df['Rank in Berlin-Brandenburg'].dropna().astype(int)
df['Rank in Germany'] = df['Rank in Germany'].dropna().astype(int)
df['Founded'] = df['Founded'].dropna().astype(int)

# 6. Save the cleaned dataset to the same CSV file
df.to_csv('berlin_universities_clean.csv', index=False)

print("Cleaned dataset saved to berlin_universities_clean.csv")

Cleaned dataset saved to berlin_universities_clean.csv


In [22]:
df=pd.read_csv('berlin_universities_clean.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 6 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   University Name             36 non-null     object 
 1   Rank in Berlin-Brandenburg  36 non-null     float64
 2   Rank in Germany             36 non-null     float64
 3   Acceptance Rate             7 non-null      float64
 4   Enrollment                  36 non-null     object 
 5   Founded                     36 non-null     float64
dtypes: float64(4), object(2)
memory usage: 2.0+ KB


In [34]:
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv('berlin_universities_clean.csv')

# 1. Remove trailing '.0' from the Rank columns and Founded column
df['Rank in Berlin-Brandenburg'] = df['Rank in Berlin-Brandenburg'].apply(lambda x: str(int(x)) if pd.notna(x) else x)
df['Rank in Germany'] = df['Rank in Germany'].apply(lambda x: str(int(x)) if pd.notna(x) else x)
df['Founded'] = df['Founded'].apply(lambda x: str(int(x)) if pd.notna(x) else x)

# 2. Save the cleaned dataset to a new CSV file
df.to_csv('berlin_universities_clean1.csv', index=False)

print("Cleaned dataset saved to berlin_universities_clean1.csv")

Cleaned dataset saved to berlin_universities_clean1.csv


In [50]:
df=pd.read_csv('universities_Berlin_final.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   University Name             36 non-null     object 
 1   Rank in Berlin-Brandenburg  36 non-null     int64  
 2   Rank in Germany             36 non-null     int64  
 3   Acceptance Rate             7 non-null      float64
 4   Enrollment                  36 non-null     object 
 5   Founded                     36 non-null     float64
 6   latitude                    36 non-null     float64
 7   longitude                   36 non-null     float64
 8   postcode                    32 non-null     float64
 9   neighborhood                36 non-null     object 
 10  AGS_Code                    36 non-null     int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 3.2+ KB


In [52]:
import pandas as pd

# Load the dataset
df = pd.read_csv('universities_Berlin_final.csv')

# Convert 'Founded' column to int64, ensuring no NaN values are present
df['Founded'] = df['Founded'].dropna().astype(int)

# Convert 'postcode' column to int64 only for non-null values
df['postcode'] = df['postcode'].apply(lambda x: int(x) if pd.notna(x) else x)

# Save the updated dataset to the same CSV file
df.to_csv('universities_Berlin_final.csv', index=False)

print("Updated dataset saved to universities_Berlin_final.csv")

Updated dataset saved to universities_Berlin_final.csv


In [53]:
import pandas as pd

# Load the dataset
df = pd.read_csv('universities_Berlin_final.csv')

# Convert 'postcode' column to string, strip trailing '.0', and replace NaN with 'unknown'
df['postcode'] = df['postcode'].astype(str).str.rstrip('.0').replace('nan', 'unknown')

# Save the updated dataset to the same CSV file
df.to_csv('universities_Berlin_final.csv', index=False)

print("Updated dataset saved to universities_Berlin_final.csv")

Updated dataset saved to universities_Berlin_final.csv
