In [80]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage to scrape
url = 'https://edurank.org/geo/de-berlin/'

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all university blocks
    university_blocks = soup.find_all('div', class_='block-cont')
    
    # List to store the scraped data
    universities_info = []
    
    for block in university_blocks:
        # Extract the university name
        university_name_tag = block.find('h2', class_='h4 font-weight-bold text-center')
        if university_name_tag and university_name_tag.find('a'):
            university_name = university_name_tag.find('a').text.strip()
        else:
            university_name = 'N/A'
        
        # Extract the rankings
        ranks = block.find_all('div', class_='uni-card__rank')
        rank_in_berlin_brandenburg = ranks[0].text.strip() if len(ranks) > 0 else 'N/A'
        rank_in_germany = ranks[1].text.strip() if len(ranks) > 1 else 'N/A'
        
        # Clean up the rankings to extract just the number
        rank_in_berlin_brandenburg = rank_in_berlin_brandenburg.split('#')[-1].strip()
        rank_in_germany = rank_in_germany.split('#')[-1].strip()
        
        # Append the extracted information to the list
        universities_info.append({
            'University Name': university_name,
            'Rank in Berlin-Brandenburg': rank_in_berlin_brandenburg,
            'Rank in Germany': rank_in_germany
        })
    
    # Print the scraped information
    for uni in universities_info:
        print(f"University Name: {uni['University Name']}")
        print(f"Rank in Berlin-Brandenburg: {uni['Rank in Berlin-Brandenburg']}")
        print(f"Rank in Germany: {uni['Rank in Germany']}")
        print("-" * 50)
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

University Name: N/A
Rank in Berlin-Brandenburg: N/A
Rank in Germany: N/A
--------------------------------------------------
University Name: 1. Free University of Berlin
Rank in Berlin-Brandenburg: 1 in Brandenburg
Rank in Germany: 5 in Germany
--------------------------------------------------
University Name: 2. Humboldt University of Berlin
Rank in Berlin-Brandenburg: 2 in Brandenburg
Rank in Germany: 8 in Germany
--------------------------------------------------
University Name: 3. Technical University of Berlin
Rank in Berlin-Brandenburg: 3 in Brandenburg
Rank in Germany: 19 in Germany
--------------------------------------------------
University Name: 4. Charite - Medical University of Berlin
Rank in Berlin-Brandenburg: 4 in Brandenburg
Rank in Germany: 21 in Germany
--------------------------------------------------
University Name: 5. Berlin University of Applied Sciences
Rank in Berlin-Brandenburg: 7 in Brandenburg
Rank in Germany: 84 in Germany
-----------------------------

In [82]:
import requests
from bs4 import BeautifulSoup
import csv
import os  # Import the os module

# URL of the webpage to scrape
url = 'https://edurank.org/geo/de-berlin/'

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all university blocks
    university_blocks = soup.find_all('div', class_='block-cont')
    
    # List to store the scraped data
    universities_info = []
    
    for block in university_blocks:
        # Extract the university name
        university_name_tag = block.find('h2', class_='h4 font-weight-bold text-center')
        if university_name_tag and university_name_tag.find('a'):
            university_name = university_name_tag.find('a').text.strip()
        else:
            university_name = 'N/A'
        
        # Extract the rankings
        ranks = block.find_all('div', class_='uni-card__rank')
        rank_in_berlin_brandenburg = ranks[0].text.strip() if len(ranks) > 0 else 'N/A'
        rank_in_germany = ranks[1].text.strip() if len(ranks) > 1 else 'N/A'
        
        # Clean up the rankings to extract just the number
        rank_in_berlin_brandenburg = rank_in_berlin_brandenburg.split('#')[-1].strip()
        rank_in_germany = rank_in_germany.split('#')[-1].strip()
        
        # Append the extracted information to the list
        universities_info.append({
            'University Name': university_name,
            'Rank in Berlin-Brandenburg': rank_in_berlin_brandenburg,
            'Rank in Germany': rank_in_germany
        })
    
    # Specify the exact path to the Downloads directory
    downloads_path = '/Users/peterscheinsohn/Downloads'
    
    # Save the scraped information to a CSV file in the specified Downloads directory
    file_path = os.path.join(downloads_path, 'universities_berlin_rating.csv')
    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['University Name', 'Rank in Berlin-Brandenburg', 'Rank in Germany'])
        writer.writeheader()
        for uni in universities_info:
            writer.writerow(uni)
    
    print(f"Data has been saved to {file_path}")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Data has been saved to /Users/peterscheinsohn/Downloads/universities_berlin_rating.csv


In [84]:
import pandas as pd

# Load your CSV file into a DataFrame
df = pd.read_csv('universities_berlin_rating.csv')

# Display the initial DataFrame
print("Initial DataFrame:")
print(df)

# Remove rows where 'University Name' or any of the ratings are 'N/A'
df_cleaned = df.dropna(subset=['University Name', 'Rank in Berlin-Brandenburg', 'Rank in Germany'])

# Save the cleaned DataFrame to a new CSV file
df_cleaned.to_csv('universities_berlin_rating_cleaned.csv', index=False)

# Display the cleaned DataFrame
print("\nCleaned DataFrame:")
print(df_cleaned)

Initial DataFrame:
                                      University Name  \
0                                                 NaN   
1                        1. Free University of Berlin   
2                    2. Humboldt University of Berlin   
3                   3. Technical University of Berlin   
4           4. Charite - Medical University of Berlin   
5            5. Berlin University of Applied Sciences   
6               6. Berlin School of Economics and Law   
7   7. Berlin Technical University of Applied Scie...   
8                    8. Berlin University of the Arts   
9                      9. Hertie School of Governance   
10                           10. Steinbeis University   
11                                    11. ESMT Berlin   
12                            12. Bard College Berlin   
13      13. SRH University of Applied Sciences Berlin   
14      14. University of Performing Arts Ernst Busch   
15        15. International Psychoanalytic University   
16          

In [None]:
import pandas as pd

# Load your CSV file into a DataFrame
df = pd.read_csv('universities_berlin_rating_cleaned.csv')

# Display the initial DataFrame
print("Initial DataFrame:")
print(df)

# Function to extract numeric part from a string
def extract_numeric(value):
    if pd.isna(value):
        return None
    return int(''.join(filter(str.isdigit, str(value))))

# Apply the function to the rating columns
df['Rank in Berlin-Brandenburg'] = df['Rank in Berlin-Brandenburg'].apply(extract_numeric)
df['Rank in Germany'] = df['Rank in Germany'].apply(extract_numeric)

# Convert the rating columns to integers
df['Rank in Berlin-Brandenburg'] = pd.to_numeric(df['Rank in Berlin-Brandenburg'], errors='coerce')
df['Rank in Germany'] = pd.to_numeric(df['Rank in Germany'], errors='coerce')

# Save the updated DataFrame to a new CSV file
df.to_csv('universities_berlin_rating_final.csv', index=False)

# Display the cleaned DataFrame
print("\nFinal DataFrame:")
print(df)

Initial DataFrame:
                                      University Name  \
0                        1. Free University of Berlin   
1                    2. Humboldt University of Berlin   
2                   3. Technical University of Berlin   
3           4. Charite - Medical University of Berlin   
4            5. Berlin University of Applied Sciences   
5               6. Berlin School of Economics and Law   
6   7. Berlin Technical University of Applied Scie...   
7                    8. Berlin University of the Arts   
8                      9. Hertie School of Governance   
9                            10. Steinbeis University   
10                                    11. ESMT Berlin   
11                            12. Bard College Berlin   
12      13. SRH University of Applied Sciences Berlin   
13      14. University of Performing Arts Ernst Busch   
14        15. International Psychoanalytic University   
15              16. Psychological Institute in Berlin   
16  17. Alic

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load your existing CSV file into a DataFrame
existing_df = pd.read_csv('universities_Berlin_final.csv')

# URL of the website to scrape
url = 'https://edurank.org/geo/de-berlin/'

# Send a request to the website
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table or list containing the university data
university_data = []
universities = soup.find_all('div', class_='university-item')  # Adjust the selector as needed

for uni in universities:
    name = uni.find('h2', class_='uni-name').text.strip()
    acceptance_rate = uni.find('span', class_='acceptance-rate').text.strip()
    enrollment = uni.find('span', class_='enrollment').text.strip()
    founded = uni.find('span', class_='founded').text.strip()
    
    # Convert to integers or dates as needed
    acceptance_rate = int(acceptance_rate.replace('%', ''))
    enrollment = int(enrollment.replace(',', ''))
    founded = pd.to_datetime(founded, format='%Y')
    
    university_data.append([name, acceptance_rate, enrollment, founded])

# Convert the scraped data to a DataFrame
new_df = pd.DataFrame(university_data, columns=['University Name', 'Acceptance Rate', 'Enrollment', 'Founded'])

# Merge the new data with the existing DataFrame
merged_df = pd.merge(existing_df, new_df, on='University Name', how='left')

# Save the updated DataFrame to a new CSV file
merged_df.to_csv('Berlin_Universities_full_info.csv', index=False)