In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# General Information

In [None]:
url = 'https://www.geostat.ge/ka'
# A timeout keeps the cell from hanging indefinitely if the server stops responding.
response = requests.get(url, timeout=30)

# Stop on any non-200 answer: carrying on would leave `html` undefined (or stale
# from an earlier run of the kernel) for the parsing cell that follows.
if response.status_code != 200:
    raise RuntimeError(f'Failed to retrieve webpage. Status code: {response.status_code}')

html = response.text
# Show only a short preview; printing the entire page floods the cell output.
print(html[:500])


In [None]:


# Build the parse tree for the downloaded page
soup = BeautifulSoup(html, 'html.parser')

# Every <section class="home-statistic-category"> holds the anchors, <h3> titles
# and <p> values collected below
links = soup.find_all('section', class_='home-statistic-category')

# href of each <a> tag found inside the sections
links_list = [anchor.get('href') for section in links for anchor in section.find_all('a')]

# Text of each <h3>, with nested text fragments joined by '<'
h3_list = [heading.get_text(separator='<') for section in links for heading in section.find_all('h3')]

# Text of each <p> with spaces removed, parsed as a float
p_list = [
    float(paragraph.getText(separator='<').replace(" ", ""))
    for section in links
    for paragraph in section.find_all('p')
]

for collected in (links_list, h3_list, p_list):
    print(collected)

In [None]:

# Pair each <h3> title with the <p> value at the same position, then lay the
# pairs out as a single-row DataFrame (one column per title).
data_dict = {title: figure for title, figure in zip(h3_list, p_list)}
data = pd.DataFrame(data=data_dict, index=[0])

data

In [None]:
# Display the first scraped link; the Population section below requests this URL
links_list[0]

## Population

In [None]:
url = links_list[0]
# A timeout keeps the cell from hanging indefinitely if the server stops responding.
response = requests.get(url, timeout=30)

# Stop on any non-200 answer: otherwise `html` would still hold the page fetched
# earlier in the notebook and the next cell would silently parse the wrong document.
if response.status_code != 200:
    raise RuntimeError(f'Failed to retrieve webpage. Status code: {response.status_code}')

html = response.text
# Show only a short preview; printing the entire page floods the cell output.
print(html[:500])


In [None]:
# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')

# Find the first <table> element. find() returns None when nothing matches, so
# fail here with a clear message instead of an AttributeError further down.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page.')

def has_georgian(text):
    """Return True when ``text`` contains at least one Georgian-script character.

    The character class covers the code-point ranges U+10A0-U+10FF and
    U+1C90-U+1CBF; any match anywhere in the string counts.
    """
    return re.search(r'[\u10A0-\u10FF\u1C90-\u1CBF]+', text) is not None

# Keep the stripped text of every <td> cell in the table that contains Georgian characters
indexes = [
    cell_text
    for cell_text in (cell.text.strip() for cell in table.find_all('td'))
    if has_georgian(cell_text)
]

# Echo the collected labels, one per line
for label in indexes:
    print(label)


In [None]:

# Read the HTML tables at the first scraped link into a list of DataFrames.
# NOTE(review): read_html appears to raise ValueError when a page has no table,
# so the `else` branch below is likely only a defensive fallback - confirm.
dfs = pd.read_html(links_list[0])

if dfs:
    df = dfs[0]  # Assuming the first table is the one of interest
    # The first parsed row carries the header labels: promote it to column names...
    df.columns = df.iloc[0]
    # ...and drop it from the data. copy() makes `df` an independent frame, so the
    # later `.loc` assignments do not trigger SettingWithCopyWarning on a slice.
    df = df.iloc[1:].copy()
    print("DataFrame from HTML Table:")
    print(df)
else:
    print("No tables found in the HTML content.")


In [None]:
# Display the Georgian-text labels collected from the table's <td> cells
indexes

In [None]:
# Rename the columns: a label column followed by one string-named column per year, 2016-2024
df.columns = ['Features'] + [str(year) for year in range(2016, 2025)]

# Overwrite the label column with the Georgian labels collected from the <td> cells
mydict = dict(Features=indexes)
df.loc[:, 'Features'] = mydict['Features']

df

In [None]:
def convert_to_float(x):
    """Convert a scraped table cell to a float.

    Strings are cleaned first: all whitespace is removed (regular and
    non-breaking spaces alike, as used for thousands grouping) and a decimal
    comma is turned into a decimal point. Non-string values (numbers, NaN,
    None) are passed straight to ``float`` instead of failing on ``.replace``.

    Parameters:
        x: the cell value, typically a string such as ``'3 728,6'``.

    Returns:
        The parsed float, or ``None`` when the value cannot be interpreted
        as a number.
    """
    if isinstance(x, str):
        # str.split() with no argument splits on any Unicode whitespace,
        # so joining the pieces drops spaces, tabs and non-breaking spaces.
        x = ''.join(x.split()).replace(',', '.')
    try:
        return float(x)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects.
        return None

# Convert every column after the first ('Features') to floats.
# Plain `df[col] = ...` replaces the column, so it takes the dtype of the
# converted values; `df.loc[:, col] = ...` writes into the existing object
# column and can leave the dtype as object.
for col in df.columns[1:]:
    df[col] = df[col].apply(convert_to_float)

df

In [None]:
import matplotlib.pyplot as plt
# Switch the year columns to integer labels so they plot on a numeric x-axis
df.columns = ['Features'] + list(range(2016, 2025))

# After transposing, row 0 holds the 'Features' labels and the remaining rows
# hold one year each; every column corresponds to one row of the original table.
df_t = df.transpose()

years = df.columns[1:]
# Column 0 of the transposed frame is the first table row; the labels below
# present it as population in thousands.
y = df_t.iloc[1:, 0]

# Plot the data
plt.plot(years, y)
plt.xlabel('Year')
plt.ylabel('Population (Thousands)')
plt.title('Population Over Years (Thousands)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# After transposing, row 0 holds the 'Features' labels and the remaining rows
# hold one year each; every column corresponds to one row of the original table.
df_t = df.transpose()

years = df.columns[1:]
# Column 3 of the transposed frame is the fourth table row; the labels below
# present it as the number of weddings.
y = df_t.iloc[1:, 3]

# Plot bar chart
plt.figure(figsize=(10, 6))
bars = plt.bar(years, y)

# Add values on top of each bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval, round(yval, 2), ha='center', va='bottom')

# Set title and labels
plt.title('Weddings Over Years')
plt.ylabel('Weddings')
plt.xlabel('Year')
plt.grid(axis='y')

# Show the plot
plt.show()