In [11]:
pip install lxml

Collecting lxml
  Downloading lxml-5.3.1-cp311-cp311-macosx_10_9_universal2.whl.metadata (3.7 kB)
Downloading lxml-5.3.1-cp311-cp311-macosx_10_9_universal2.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lxml
Successfully installed lxml-5.3.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
#import required libraries

from bs4 import BeautifulSoup
import requests
import random
import pandas as pd

In [3]:
#check we can get the data from the page

#here we use 'query' for the end of the url, this allows us to quickly change it
query = 'Linguistic_diversity_index'

url = 'https://en.wikipedia.org/wiki/Linguistic_diversity_index' + query
response = requests.get(url)
bs_html = BeautifulSoup(response.text, features="html.parser")

#this grabs us the html of the entire page

In [4]:
# we can preview the html contents
print(bs_html.prettify())

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-not-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Linguistic diversity indexLinguistic diversity index - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpr

In [5]:
#this will check if the request was sucsessful. we want it to be 200, or at least start with a 2... anything else is a problem.

print(response.status_code)

404


In [6]:
#now I want to find only the link on this page
#first we create an array for the links

links = []

#we are looking for all of the <a> anchor tags.
# we do this with a for loop, we use 'try' and 'except' as some of the anchors may not have an 'href'. we ignore these otherwise it could cause an error.

for a in bs_html.find_all("a"):
    try:
        links.append(a["href"])
    except:
        pass


    #then another for loop to cycle though the array and print each link
for link in links:
    print(link)

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Special:SpecialPages
/wiki/Main_Page
/wiki/Special:Search
https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=Linguistic+diversity+indexLinguistic+diversity+index
/w/index.php?title=Special:UserLogin&returnto=Linguistic+diversity+indexLinguistic+diversity+index
https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=Linguistic+diversity+indexLinguistic+diversity+index
/w/index.php?title=Special:UserLogin&returnto=Linguistic+diversity+indexLinguistic+diversity+index
/wiki/

In [9]:
#many of the links are from outside wikipedia. in this case we only want internal links

#we can then filter the array to only include links starting with /wiki/. so only internal links will show.

filtered = []

for link in links:
  if link.startswith('/wiki/'):
    filtered.append(link)

for f in filtered:
    print(f)

/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Special:SpecialPages
/wiki/Main_Page
/wiki/Special:Search
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
/wiki/Special:WhatLinksHere/Linguistic_diversity_indexLinguistic_diversity_index
/wiki/Special:SiteMatrix
/wiki/Wikipedia:User_access_levels#Autoconfirmed_users
/wiki/Wikipedia:Article_wizard
/wiki/Wikipedia:Requested_articles
/wiki/Special:WhatLinksHere/Linguistic_diversity_indexLinguistic_diversity_index
/wiki/Special:Purge/Linguistic_diversity_indexLinguistic_diversity_index
/wiki/Case_sensitivity
/wiki/Wikipedia:Redirect
/wiki/Wikipedia:Why_was_the_page_I_created_deleted%3F
/wiki/Wikipedia:About
/wiki/Wikipedia:General_disclaimer


In [8]:
#there are still a lot of links to stuff we dont want eg. pictures, help files ect. We can use ignore to filter them out.

ignores = ['png', 'jpg', 'jpeg', 'isbn', 'svg', 'identifier', \
           'File', 'Special', 'Template', 'Mailto', 'Portal', \
           'Help', 'Category', 'Talk', 'Wikipedia', 'Main_Page']

filtered = []

#this line states only links that are to wiki pages are valid
for link in links:
    if link.startswith('/wiki/'):
        valid = True

        # this line then makes all our ingnored links invalid
        for ignore in ignores:
            if ignore in link:
                valid = False
                break

        # if the link is valid we then add it to our 'filtered' array
        if valid:
            filtered.append(link)

for f in filtered:
    print(f)

/wiki/Case_sensitivity


In [58]:
# get the response in the form of html
wikiurl="https://en.wikipedia.org/wiki/Linguistic_diversity_index"

# check the request was sucsessful (code 200)
response=requests.get(wikiurl)
print(response.status_code)

# parse data from the html into a beautifulsoup object
bs_html = BeautifulSoup(response.text, 'html.parser')

# here we find any element with the table tag, there are some of these we dont want on this page.
# So we specify only tables using the "wikitable" class

tabledata=bs_html.find('table',{'class':"wikitable"})

#read the table data
df=pd.read_html(str(tabledata))

# convert list to pandas dataframe
df=pd.DataFrame(df[0])
print(df.head())

#write the data to a .csv file
df.to_csv('Linguistic_diversity_index.csv', sep='\t', encoding='utf-8')

200
   Rank          Country / region    LDI
0     1          Papua New Guinea  0.988
1     2                  Cameroon  0.974
2     3                   Vanuatu  0.973
3     4           Solomon Islands  0.968
4     5  Central African Republic  0.959


  df=pd.read_html(str(tabledata))


In [62]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('Linguistic_diversity_index.csv')

# Rename the 'Country / region' column
df.rename(columns={'Country / region': 'Country_region'}, inplace=True)

# Save the updated CSV file
df.to_csv('updated_Linguistic_diversity_index.csv', index=False)
print(df.head())


           \tRank\tCountry / region\tLDI
0          0\t1\tPapua New Guinea\t0.988
1                  1\t2\tCameroon\t0.974
2                   2\t3\tVanuatu\t0.973
3           3\t4\tSolomon Islands\t0.968
4  4\t5\tCentral African Republic\t0.959


In [59]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('Linguistic_diversity_index.csv')

# Replace the '/' in the column name with an underscore '_'
df.rename(columns={'Country / region': 'Country_region'}, inplace=True)

# Save the updated CSV file
df.to_csv('updated_Linguistic_diversity_index.csv', index=False)

In [52]:
# descriptive statistics of linguistic diversity 
df[["LDI"]].describe() 

Unnamed: 0,LDI
count,232.0
mean,0.430108
std,0.297934
min,0.0
25%,0.16225
50%,0.449
75%,0.69725
max,0.988


In [36]:
from collections import Counter
#Median
median = df["LDI"].median()
print(median)

0.449


In [57]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('Linguistic_diversity_index.csv')

# Remove the '/' from the column name
df.rename(columns={'Country / region/': 'Country_or_reagion'}, inplace=True)

# Save the updated CSV file
df.to_csv('Linguistic_diversity_index.csv', index=False)
print(df.head())

           \tRank\tCountry / region\tLDI
0          0\t1\tPapua New Guinea\t0.988
1                  1\t2\tCameroon\t0.974
2                   2\t3\tVanuatu\t0.973
3           3\t4\tSolomon Islands\t0.968
4  4\t5\tCentral African Republic\t0.959


In [None]:
import pandas as pd
import seaborn as sns

df = pd.read_csv('Linguistic_diversity_index.csv')

sns.set(rc={'figure.figsize':(16,9)})

g = df.groupby('Country', as_index=False)['LDI'].sum().sort_values(by='LDI', ascending=False).head(10)
sns.barplot(data=g, x='Country', y='LDI', hue='Country / region', dodge=False).set(xticklabels=[]);

KeyError: 'Country'

In [37]:
import matplotlib.pyplot as plt

# Plot linguistic diversity by country
LDI.sort_values().plot(kind="bar", figsize=(12, 6), color="orange")
plt.title("Linguistic diversity by country")
plt.xlabel("Country")
plt.ylabel("LDI")
plt.xticks(rotation=45)  # Rotate state names for better readability
plt.show()


NameError: name 'LDI' is not defined

In [22]:
import pandas as pd
import matplotlib.pyplot as plt

# Read data from CSV file
df = pd.read_csv('Linguistic_diversity_index.csv')

# Extract the relevant columns
countries = data["Country / region"].tolist()
LDI = data["LDI"].tolist()

# Sort the data in descending order
LDI.sort(reverse=True)
sorted_countries = [countries[i] for i in sorted(range(len(countries)), key=lambda i: LDI[i], reverse=True)]

# Create a bar chart
plt.bar(sorted_states, firespots)
plt.title('LDI by country')
plt.xlabel('Country / region')
plt.ylabel('LDI')
plt.xlabel("x",labelpad=100)
plt.show()


KeyError: 'Country / region'

In [23]:
import pandas as pd
import matplotlib.pyplot as plt

# Read data from CSV file
data = pd.read_csv('Linguistic_diversity_index.csv')

# Extract the relevant columns
LDI = data["LDI"].tolist()

# Sort the data in descending order
LDI.sort(reverse=True)

# Create a bar chart
plt.bar(sorted_states, firespots)
plt.title('LDI by country')
plt.xlabel('Country / region')
plt.ylabel('LDI')
plt.xlabel("x",labelpad=100)
plt.show()


KeyError: 'LDI'