In [59]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from google.cloud import storage
import os

# Objective of this program: extract the data set from the given URL using web scraping and save a CSV version of it to Google Cloud
# ----------------------------------------------------------------------------------------------------------------------------------------

# Step 1: Web Scraping

# URL of the web page the data is extracted from
url = "https://www.statistics.gov.lk/Agriculture/StaticalInformation/PaddyStatistics/Annual-BothSeasons-1952-2015"

# Fetch the page. requests applies no timeout unless one is given, so without
# this argument an unresponsive server would leave the cell hanging forever.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Raise on HTTP error status codes (4xx / 5xx)

soup = BeautifulSoup(response.text, 'html.parser')

# Collect every <table> on the page and fail with a readable message if there
# are none (indexing an empty result would otherwise raise a bare IndexError).
tables = soup.find_all('table')
if not tables:
    raise ValueError(f"No <table> elements found at {url}")
table = tables[-1]  # Last table taken to eliminate the unwanted data sets

# Extract table data: one list of stripped cell texts per <tr>, taking both
# header (<th>) and data (<td>) cells.
data = []
for row in table.find_all('tr'):
    cells = row.find_all(['th', 'td'])
    cell_data = [cell.get_text(strip=True) for cell in cells]
    if len(cell_data) > 3:  # Filter out unwanted rows (three cells or fewer)
        data.append(cell_data)

# Stop early with a clear message when no row survived the filter; otherwise
# max() below fails with the opaque "max() arg is an empty sequence".
if not data:
    raise ValueError(
        "No table rows with more than 3 cells were found; "
        "the page layout may have changed"
    )

# Ensure rows have equal columns
max_columns = max(len(row) for row in data)
for row in data:
    row += [""] * (max_columns - len(row))  # Fill missing cells with empty strings

# Create DataFrame; the first kept row supplies the column names,
# the remaining rows are the records.
df = pd.DataFrame(data[1:], columns=data[0])

# Convert DataFrame to a CSV string (row index omitted)
csv_data = df.to_csv(index=False)

# ---------------------------------------------------------------------------------------------------------------
# Step 2 - Upload csv file to Google Cloud

# Google Cloud authentication.
# setdefault keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already set
# in the environment and only falls back to the local key file when none is
# configured, so the notebook no longer overwrites existing credentials with a
# machine-specific path.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "C:/Users/suneth.jayamanne/msc-project-portfolio-8c9eb0909b58.json",
)

# Initialize Google Cloud Storage client
client = storage.Client()

# Define bucket and destination filename
bucket_name = "paddy_statistics"
destination_blob_name = "paddy_statistics_1952_2015.csv"

# Upload the CSV string (csv_data, built in Step 1) to Google Cloud Storage
bucket = client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
blob.upload_from_string(csv_data, content_type='text/csv')

print(f"Dataset uploaded to GCS bucket '{bucket_name}' as '{destination_blob_name}'!")


Dataset uploaded to GCS bucket 'paddy_statistics' as 'paddy_statistics_1952_2015.csv'!


In [69]:
# Step 3 - Pull the uploaded object back from GCS into the working folder.
# `blob` is the handle created in the upload step (Step 2), so that cell must
# have been run in this kernel before this one.

# Local destination for the downloaded copy (path wrapped for readability;
# adjacent literals join into the same single string).
downloaded_file_path = (
    "D:/Personal/MSc/MSc. Data Science/Lecture Notes/"
    "2. Principles of DS/Assignment and Passpapers/"
    "Assignments/Portfolio/"
    "paddy_statistics_1952_2015_downloaded.csv"
)
blob.download_to_filename(downloaded_file_path)
print(f"File downloaded from GCS to '{downloaded_file_path}'")

File downloaded from GCS to 'D:/Personal/MSc/MSc. Data Science/Lecture Notes/2. Principles of DS/Assignment and Passpapers/Assignments/Portfolio/paddy_statistics_1952_2015_downloaded.csv'


In [73]:
# Step 4 - Read the cleansed CSV into a DataFrame for EDA.
# This rebinds `df`, replacing the DataFrame built from the scraped table.
# NOTE(review): the file read here is "..._cleansed.csv", not the
# "..._downloaded.csv" written in Step 3; the cleansing step is not visible in
# this part of the notebook - confirm how that file is produced.
file_path = (
    "D:/Personal/MSc/MSc. Data Science/Lecture Notes/"
    "2. Principles of DS/Assignment and Passpapers/"
    "Assignments/Portfolio/"
    "paddy_statistics_1952_2015_cleansed.csv"
)
df = pd.read_csv(file_path)
print("cleansed file loaded")

cleansed file loaded
