In [2]:
!pip install tabulate requests beautifulsoup4 pandas



In [3]:
import requests

# Page that lists the languages supported by the Cloud Vision API.
url = 'https://cloud.google.com/vision/docs/languages'
headers = {'User-Agent': 'Mozilla/5.0'}  # Some servers require a user-agent header

# A timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of
# letting later cells parse an error page as if it contained the tables.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
html_content = response.content



In [4]:
from bs4 import BeautifulSoup

# Parse the downloaded bytes with the 'html.parser' backend, then collect
# every <table> element found anywhere in the document.
soup = BeautifulSoup(markup=html_content, features='html.parser')
tables = soup.find_all(name='table')


In [5]:
import pandas as pd

def _row_texts(tr):
    """Return the whitespace-stripped text of each <td>/<th> cell in one <tr>."""
    return [cell.get_text(strip=True) for cell in tr.find_all(['td', 'th'])]


def _is_header_row(tr):
    """Return True when the row has at least one cell and every cell is a <th>."""
    cells = tr.find_all(['td', 'th'])
    return bool(cells) and all(cell.name == 'th' for cell in cells)


def _fit_to_width(row, width):
    """Return a new list of exactly `width` items: padded with '' or truncated."""
    return (row + [''] * (width - len(row)))[:width]


def table_to_dataframe(table):
    """Convert one parsed <table> element into a DataFrame of cell text.

    The first row made up solely of <th> cells supplies the column names.
    Using only that row (instead of every <th> in the table) keeps row-header
    cells that appear in body rows from being counted as extra columns.
    Every other non-empty row becomes a data row, padded or truncated to the
    header width. A table with no such header row is returned with pandas'
    default integer column labels.
    """
    # Deliberately not named `headers`: that name holds the HTTP request
    # headers dict used when fetching the page and must not be overwritten.
    column_names = None
    body_rows = []
    for tr in table.find_all('tr'):
        if column_names is None and _is_header_row(tr):
            column_names = _row_texts(tr)
            continue
        texts = _row_texts(tr)
        if texts:
            body_rows.append(texts)

    if column_names is None:
        return pd.DataFrame(body_rows)

    width = len(column_names)
    fitted_rows = [_fit_to_width(row, width) for row in body_rows]
    return pd.DataFrame(fitted_rows, columns=column_names)


# One DataFrame per <table> on the page, in document order.
dataframes = [table_to_dataframe(table) for table in tables]

# Output the tables: preview the first rows and save each one as table_<n>.csv
for table_number, frame in enumerate(dataframes, start=1):
    print(f"Table {table_number}")
    print(frame.head())  # Print the first few rows
    frame.to_csv(f'table_{table_number}.csv', index=False)


Table 1
     Language Language (English name) languageHintscode Script  \
0   Afrikaans               Afrikaans                af   Latn   
1       shqip                Albanian                sq   Latn   
2     العربية                  Arabic                ar   Arab   
3         Հայ                Armenian                hy   Armn   
4  беларуская              Belarusian                be   Cyrl   

             Notes  
0                   
1                   
2  Modern Standard  
3                   
4                   
Table 2
                  Language Language (English name) languageHintscode Script  \
0                     አማርኛ                 Amharic                am   Ethi   
1          Αρχαία ελληνικά           Ancient Greek               grc   Grek   
2                  অসমীয়া                Assamese                as   Beng   
3               Azərbaycan             Azerbaijani                az   Latn   
4  Azərbaycan (qədim yazı)             Azerbaijani           az-Cy

In [6]:
from tabulate import tabulate

# Render every scraped table as a boxed grid, numbering the tables from 1
for position, frame in enumerate(dataframes, start=1):
    print(f"\nTable {position}:")
    grid = tabulate(frame, headers='keys', tablefmt='fancy_grid', showindex=False)
    print(grid)



Table 1:
╒═════════════════════════════╤═══════════════════════════╤═════════════════════╤═══════════╤═════════════════╕
│ Language                    │ Language (English name)   │ languageHintscode   │ Script    │ Notes           │
╞═════════════════════════════╪═══════════════════════════╪═════════════════════╪═══════════╪═════════════════╡
│ Afrikaans                   │ Afrikaans                 │ af                  │ Latn      │                 │
├─────────────────────────────┼───────────────────────────┼─────────────────────┼───────────┼─────────────────┤
│ shqip                       │ Albanian                  │ sq                  │ Latn      │                 │
├─────────────────────────────┼───────────────────────────┼─────────────────────┼───────────┼─────────────────┤
│ العربية                     │ Arabic                    │ ar                  │ Arab      │ Modern Standard │
├─────────────────────────────┼───────────────────────────┼─────────────────────┼───────────┼─

In [7]:
import pandas as pd

# Column layout applied to each of the tables being merged.
MERGED_COLUMNS = ['Language', 'Language (English name)', 'languageHintscode', 'Script', 'Notes']

# Indexes (into `dataframes`) of the tables to merge, and the position within
# that selection whose rows are flagged 'Mapped'.
# NOTE(review): presumably the third table is the page's "mapped languages"
# list -- confirm against the source page if its layout changes.
SOURCE_TABLE_INDEXES = (0, 1, 2)
MAPPED_TABLE_POSITION = 2

# Build the frames to merge from COPIES of the scraped tables. Renaming columns
# and adding 'Mapping' directly on `dataframes[i]` would make this cell fail on
# a second run (the column count would no longer match MERGED_COLUMNS) and
# would silently change what the earlier cells display.
tables_to_merge = []
for position, source_index in enumerate(SOURCE_TABLE_INDEXES):
    frame = dataframes[source_index].copy()
    frame.columns = MERGED_COLUMNS  # Standardize column names across tables
    frame['Mapping'] = 'Mapped' if position == MAPPED_TABLE_POSITION else ''
    tables_to_merge.append(frame)

# Merge tables into one frame with a fresh 0..n-1 index
merged_table = pd.concat(tables_to_merge, ignore_index=True)

# Print the merged table
print("\nMerged Table:")
print(tabulate(merged_table, headers='keys', tablefmt='fancy_grid', showindex=True))



Merged Table:
╒═════╤═══════════════════════════════════════════════════╤═════════════════════════════════╤═════════════════════╤═══════════╤═════════════════╤═══════════╕
│     │ Language                                          │ Language (English name)         │ languageHintscode   │ Script    │ Notes           │ Mapping   │
╞═════╪═══════════════════════════════════════════════════╪═════════════════════════════════╪═════════════════════╪═══════════╪═════════════════╪═══════════╡
│   0 │ Afrikaans                                         │ Afrikaans                       │ af                  │ Latn      │                 │           │
├─────┼───────────────────────────────────────────────────┼─────────────────────────────────┼─────────────────────┼───────────┼─────────────────┼───────────┤
│   1 │ shqip                                             │ Albanian                        │ sq                  │ Latn      │                 │           │
├─────┼──────────────────────────────

In [8]:
# # Add a new column 'Support Level' to the merged table with default value 'N/A'
# merged_table['Support Level'] = 'N/A'

# # Table with index 3
# table_3 = dataframes[3]

# # Iterate over the merged table and update 'Support Level' based on matching Script values
# for index, row in merged_table.iterrows():
#     if row['Script'] in table_3['Script'].values:
#         merged_table.at[index, 'Support Level'] = 'Supports Handwriting'

# # Print the updated merged table
# print("\nUpdated Merged Table:")
# print(tabulate(merged_table, headers='keys', tablefmt='fancy_grid', showindex=True))

# Start with an empty-string "Support Level" column on the merged table
# (it is replaced with the looked-up values below)
merged_table['Support Level'] = ''

# Build a Script -> Support Level lookup from the table at index 3
support_table = dataframes[3]
script_support_mapping = {
    script: level
    for script, level in zip(support_table['Script'], support_table['Support Level'])
}

# Look up each row's Script; scripts absent from the lookup end up as ''
looked_up_levels = merged_table['Script'].map(script_support_mapping)
merged_table['Support Level'] = looked_up_levels.fillna('')

# Show the merged table including the new column
print("\nUpdated Merged Table with Support Level:")
updated_grid = tabulate(merged_table, headers='keys', tablefmt='fancy_grid', showindex=True)
print(updated_grid)



Updated Merged Table with Support Level:
╒═════╤═══════════════════════════════════════════════════╤═════════════════════════════════╤═════════════════════╤═══════════╤═════════════════╤═══════════╤═════════════════╕
│     │ Language                                          │ Language (English name)         │ languageHintscode   │ Script    │ Notes           │ Mapping   │ Support Level   │
╞═════╪═══════════════════════════════════════════════════╪═════════════════════════════════╪═════════════════════╪═══════════╪═════════════════╪═══════════╪═════════════════╡
│   0 │ Afrikaans                                         │ Afrikaans                       │ af                  │ Latn      │                 │           │ Supported       │
├─────┼───────────────────────────────────────────────────┼─────────────────────────────────┼─────────────────────┼───────────┼─────────────────┼───────────┼─────────────────┤
│   1 │ shqip                                             │ Albanian          

In [9]:
# Sort the merged table by the 'languageHintscode' column (ascending).
# A stable sort (mergesort) keeps rows that share the same code in their
# original merged order, so the row order -- and anything derived from it --
# is reproducible from run to run.
sorted_table = merged_table.sort_values(by='languageHintscode', ascending=True, kind='mergesort')

# Save the sorted table to a CSV file
output_filename = "sorted_merged_table.csv"
sorted_table.to_csv(output_filename, index=False)

# Print confirmation message
print(f"The sorted table has been saved to '{output_filename}'.")


The sorted table has been saved to 'sorted_merged_table.csv'.


In [11]:
import json

def _compose_title(row):
    """Build the display title for one row of the sorted table.

    The title is "<English name> / <Language> / <Script>", followed by
    " / Mapped" when the row's 'Mapping' value is non-empty, and then by
    " / Supports handwriting" when its 'Support Level' is 'Experimental'
    or 'Supported'.
    """
    title = f"{row['Language (English name)']} / {row['Language']} / {row['Script']}"
    if row.get('Mapping', ''):
        title += " / Mapped"
    if row['Support Level'] in ('Experimental', 'Supported'):
        title += " / Supports handwriting"
    return title


# One entry per language hint code, visited in sorted_table order; when a code
# occurs more than once, the entry from the last such row is the one kept.
google_json = {
    row['languageHintscode']: {
        "languages": [row['languageHintscode']],
        "title": _compose_title(row),
    }
    for _, row in sorted_table.iterrows()
}

# Write the entries under a top-level "google" key, keeping non-ASCII text as-is
output_json_filename = "google_languages.json"
payload = {"google": google_json}
with open(output_json_filename, 'w', encoding='utf-8') as json_file:
    json.dump(payload, json_file, ensure_ascii=False, indent=4)

# Confirm where the file was written
print(f"JSON file has been created and saved as '{output_json_filename}'.")


JSON file has been created and saved as 'google_languages.json'.
