In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the webpage
url = 'https://www.scrapethissite.com/pages/forms/'
page = requests.get(url)

# Parse the page content
soup = BeautifulSoup(page.text, 'html.parser')

# Find the table of countries
table = soup.find('table', class_='table')

# Extract the rows of the table (skipping the header row)
rows = table.find_all('tr')[1:]

# Create lists to store data
countries = []
years = []
populations = []
gdp = []

# Loop through the rows and extract data
for row in rows:
    cols = row.find_all('td')
    country = cols[0].get_text().strip()
    year = int(cols[1].get_text().strip())
    population = int(cols[2].get_text().strip().replace(',', ''))
    gdp_value = float(cols[3].get_text().strip().replace(',', ''))
    
    countries.append(country)
    years.append(year)
    populations.append(population)
    gdp.append(gdp_value)

# Create a DataFrame to store the scraped data
data = pd.DataFrame({
    'Country': countries,
    'Year': years,
    'Population': populations,
    'GDP': gdp
})

# Show the first few rows of the data
print(data.head())


              Country  Year  Population   GDP
0       Boston Bruins  1990          44  24.0
1      Buffalo Sabres  1990          31  30.0
2      Calgary Flames  1990          46  26.0
3  Chicago Blackhawks  1990          49  23.0
4   Detroit Red Wings  1990          34  38.0


In [12]:

avg_population = data.groupby('Country')['Population'].mean().reset_index()
print(avg_population)


                  Country  Population
0           Boston Bruins        40.0
1          Buffalo Sabres        31.0
2          Calgary Flames        38.5
3      Chicago Blackhawks        42.5
4       Detroit Red Wings        34.0
5         Edmonton Oilers        37.0
6        Hartford Whalers        31.0
7       Los Angeles Kings        46.0
8   Minnesota North Stars        27.0
9      Montreal Canadiens        39.0
10      New Jersey Devils        32.0
11     New York Islanders        25.0
12       New York Rangers        36.0
13    Philadelphia Flyers        33.0
14    Pittsburgh Penguins        41.0
15       Quebec Nordiques        16.0
16        St. Louis Blues        47.0
17    Toronto Maple Leafs        23.0
18      Vancouver Canucks        28.0
19    Washington Capitals        37.0
20          Winnipeg Jets        26.0


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Check available countries
print(data['Country'].unique())

# Filter for a valid country name (adjust based on the actual name from the unique values)
country_data = data[data['Country'] == 'USA']  # Example: Use the correct country name

# Ensure the data isn't empty
if not country_data.empty:
    # Prepare features (Year) and target (Population)
    X = country_data[['Year']]
    y = country_data['Population']
    
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the linear regression model
    model = LinearRegression()

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error: {mse}')

    # Display predictions
    predictions = pd.DataFrame({
        'Year': X_test['Year'],
        'Actual Population': y_test,
        'Predicted Population': y_pred
    })

    print(predictions)
else:
    print("No data found for the specified country.")


['Boston Bruins' 'Buffalo Sabres' 'Calgary Flames' 'Chicago Blackhawks'
 'Detroit Red Wings' 'Edmonton Oilers' 'Hartford Whalers'
 'Los Angeles Kings' 'Minnesota North Stars' 'Montreal Canadiens'
 'New Jersey Devils' 'New York Islanders' 'New York Rangers'
 'Philadelphia Flyers' 'Pittsburgh Penguins' 'Quebec Nordiques'
 'St. Louis Blues' 'Toronto Maple Leafs' 'Vancouver Canucks'
 'Washington Capitals' 'Winnipeg Jets']
No data found for the specified country.


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error


country_data = data[data['Country'] == 'USA']  


if not country_data.empty:
    
    X = country_data[['Year']]
    y = country_data['Population']

   
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    
    poly = PolynomialFeatures(degree=3)  
    X_poly = poly.fit_transform(X_train)
    
    poly_model = LinearRegression()
    poly_model.fit(X_poly, y_train)

    
    y_poly_pred = poly_model.predict(poly.transform(X_test))
    poly_mse = mean_squared_error(y_test, y_poly_pred)
    print(f'Polynomial Regression Mean Squared Error: {poly_mse}')

    
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    
    y_rf_pred = rf_model.predict(X_test)
    rf_mse = mean_squared_error(y_test, y_rf_pred)
    print(f'Random Forest Regression Mean Squared Error: {rf_mse}')

   
    svr_model = SVR(kernel='rbf')  
    svr_model.fit(X_train, y_train)

    
    y_svr_pred = svr_model.predict(X_test)
    svr_mse = mean_squared_error(y_test, y_svr_pred)
    print(f'Support Vector Regression Mean Squared Error: {svr_mse}')

    
    gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
    gb_model.fit(X_train, y_train)

   
    y_gb_pred = gb_model.predict(X_test)
    gb_mse = mean_squared_error(y_test, y_gb_pred)
    print(f'Gradient Boosting Regression Mean Squared Error: {gb_mse}')

else:
    print("No data found for the specified country.")


No data found for the specified country.
