# Notebook for crawling the UCI Machine Learning repository website + Convinient functions for downloading, summarizing or viewing information about various datasets

### Impoer libraries

In [297]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import requests
import os

### Open the connection, read the HTML and parse it using BeautifulSoup library

In [298]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Read the HTML from the URL and pass on to BeautifulSoup
url = 'https://archive.ics.uci.edu/ml/datasets.html?format=&task=&att=&area=&numAtt=&numIns=&type=&sort=nameUp&view=list'
print("Opening the file connection...")
uh= urllib.request.urlopen(url, context=ctx)
print("HTTP status",uh.getcode())
html =uh.read()
print(f"Reading done. Total {len(html)} characters read.")

# Soupify!
soup = BeautifulSoup(html, 'html5lib')

Opening the file connection...
HTTP status 200
Reading done. Total 114429 characters read.


### Read all _p_-tag contents and extract dataset name, url, and description from them 

In [299]:
lst=[]
for tag in soup.find_all('p'):
    lst.append(tag.contents)

i=0
description_dict={}
dataset_list=[]
for l in lst:
    if len(l)>2:
        if str(l[1]).find('datasets/')!=-1:
            string=str(l[1])
            s=re.search('">.*</a>',string)
            x,y=s.span()
            description_dict[string[x+2:y-4]]=(l[2])[2:]
            s=re.search("\".*\"",string)
            x,y=s.span()
            dataset_list.append(string[x+10:y-1])
            i+=1
print(f"{i} datasets read")

425 datasets read


### Function for scraping through individual dataset pages to extract the _Abstract_ and _dataurl_ (the page where actual data files reside)

In [300]:
def dataset_page_crawl(dataset):
    dataset_dict={}
    baseurl='https://archive.ics.uci.edu/ml/datasets/'
    url = baseurl+dataset
    dataset_dict['Dataset Page']=url
    #print("Opening the page:", url)
    try:
        uh= urllib.request.urlopen(url, context=ctx)
        html =uh.read().decode()
        soup=BeautifulSoup(html,'html5lib')
        #print(soup.text[:200])
        if soup.text.find("does not appear to exist")!=-1:
            print(f"{dataset} not found")
            return None
        else:
            dataurls = []
            for pclass in soup.find_all('p'):
                if (pclass.get_text().find('Abstract: '))!=-1:
                    dataset_dict['Abstract']=str(pclass.get_text())[len('Abstract: '):]
                    break # Breaking here is crucial. Otherwise the loop will progress through other p tags
                    # and put 'Not found' in the Abstract field
                else:
                    dataset_dict['Abstract']='Not found!'
            for link in soup.find_all('a'):
                if link.attrs['href'].find('machine-learning-databases')!=-1:
                    a=link.attrs['href']
                    a=a[2:]
                    dataurl="https://archive.ics.uci.edu/ml/"+a
                    dataurls.append(dataurl)
            
            # After finishing the for-loop with a-tags, the first dataurl is added to the dictionary 
            dataset_dict['dataurl']=dataurls[0]
            
        return dataset_dict
    except:
        print("Could not retrieve")
        return None

### Crawl through all the dataset pages in the dataset list, store the abstract and dataset urls in a dictionary

In [301]:
i=0
baseurl='https://archive.ics.uci.edu/ml/datasets/'
dataset_dicts=[]
for dataset in dataset_list:
    a=dataset_page_crawl(dataset)
    if a!=None:
        dataset_dicts.append(a)
        i+=1
        print(f"Dataset {i} processed",end=', ')

print("\nTotal datasets analyzed: ",i)

Dataset 1 processed, Dataset 2 processed, Dataset 3 processed, Dataset 4 processed, Dataset 5 processed, Dataset 6 processed, Dataset 7 processed, Dataset 8 processed, Dataset 9 processed, Dataset 10 processed, Dataset 11 processed, Dataset 12 processed, Dataset 13 processed, Dataset 14 processed, Dataset 15 processed, Dataset 16 processed, Dataset 17 processed, Dataset 18 processed, Dataset 19 processed, Dataset 20 processed, Dataset 21 processed, Dataset 22 processed, Dataset 23 processed, Dataset 24 processed, Dataset 25 processed, Dataset 26 processed, Dataset 27 processed, Dataset 28 processed, Dataset 29 processed, Dataset 30 processed, Dataset 31 processed, Dataset 32 processed, Dataset 33 processed, Dataset 34 processed, Dataset 35 processed, Dataset 36 processed, Dataset 37 processed, Dataset 38 processed, Dataset 39 processed, Dataset 40 processed, Dataset 41 processed, Dataset 42 processed, Dataset 43 processed, Dataset 44 processed, Dataset 45 processed, Dataset 46 processe

Dataset 305 processed, Dataset 306 processed, Dataset 307 processed, Dataset 308 processed, Dataset 309 processed, Dataset 310 processed, Dataset 311 processed, Dataset 312 processed, Dataset 313 processed, Dataset 314 processed, Dataset 315 processed, Dataset 316 processed, Dataset 317 processed, Dataset 318 processed, Dataset 319 processed, Dataset 320 processed, Dataset 321 processed, Dataset 322 processed, Dataset 323 processed, Dataset 324 processed, Dataset 325 processed, Dataset 326 processed, Dataset 327 processed, Dataset 328 processed, Dataset 329 processed, Dataset 330 processed, Dataset 331 processed, Dataset 332 processed, Dataset 333 processed, Dataset 334 processed, Dataset 335 processed, Dataset 336 processed, Dataset 337 processed, Dataset 338 processed, Dataset 339 processed, Dataset 340 processed, Dataset 341 processed, Dataset 342 processed, Dataset 343 processed, Dataset 344 processed, Dataset 345 processed, Dataset 346 processed, Dataset 347 processed, Dataset 348

### Create dataframes to store abstracts and dataset url links for all the items, analyzed above

#### Dataframe with _Abstract_ and _dataurl_ (the url of the page where the actual data files reside)

In [302]:
df_dataset = pd.DataFrame(data=dataset_dicts)

#### dataframe with _name_ and _Abstract_ (description)

In [303]:
df_description=pd.DataFrame(data=list(description_dict.items()),columns=['Dataset','Abstract'])

#### Merge these two dataframes on the common _Abstract_ column

In [304]:
df_joined=df_description.merge(df_dataset,on='Abstract')
df_joined

Unnamed: 0,Dataset,Abstract,Dataset Page,dataurl
0,"3D Road Network (North Jutland, Denmark)",3D road network with highly accurate elevation...,https://archive.ics.uci.edu/ml/datasets/3D+Roa...,https://archive.ics.uci.edu/ml//machine-learni...
1,AAAI 2013 Accepted Papers,This data set compromises the metadata for the...,https://archive.ics.uci.edu/ml/datasets/AAAI+2...,https://archive.ics.uci.edu/ml//machine-learni...
2,AAAI 2014 Accepted Papers,This data set compromises the metadata for the...,https://archive.ics.uci.edu/ml/datasets/AAAI+2...,https://archive.ics.uci.edu/ml//machine-learni...
3,Abalone,Predict the age of abalone from physical measu...,https://archive.ics.uci.edu/ml/datasets/Abalone,https://archive.ics.uci.edu/ml//machine-learni...
4,Abscisic Acid Signaling Network,The objective is to determine the set of boole...,https://archive.ics.uci.edu/ml/datasets/Abscis...,https://archive.ics.uci.edu/ml//machine-learni...
5,Activities of Daily Living (ADLs) Recognition ...,This dataset comprises information regarding t...,https://archive.ics.uci.edu/ml/datasets/Activi...,https://archive.ics.uci.edu/ml//machine-learni...
6,Activity Recognition from Single Chest-Mounted...,The dataset collects data from a wearable acce...,https://archive.ics.uci.edu/ml/datasets/Activi...,https://archive.ics.uci.edu/ml//machine-learni...
7,Activity Recognition system based on Multisens...,This dataset contains temporal data from a Wir...,https://archive.ics.uci.edu/ml/datasets/Activi...,https://archive.ics.uci.edu/ml//machine-learni...
8,Activity recognition with healthy older people...,Sequential motion data from 14 healthy older p...,https://archive.ics.uci.edu/ml/datasets/Activi...,https://archive.ics.uci.edu/ml//machine-learni...
9,Acute Inflammations,The data was created by a medical expert as a ...,https://archive.ics.uci.edu/ml/datasets/Acute+...,https://archive.ics.uci.edu/ml//machine-learni...


## Dataset Information Extraction from Tables

In [305]:
# Read the HTML from the URL and pass on to BeautifulSoup
url = 'https://archive.ics.uci.edu/ml/datasets.html'
print("Opening the file connection...")
uh= urllib.request.urlopen(url, context=ctx)
print("HTTP status",uh.getcode())
html =uh.read()
print(f"Reading done. Total {len(html)} characters read.")

soup = BeautifulSoup(html, 'html5lib')

Opening the file connection...
HTTP status 200
Reading done. Total 364147 characters read.


In [306]:
lst=[]
for tag in soup.find_all('p'):
    lst.append(tag.contents)

In [307]:
lst_stats=[]
lst_dataset=[]
for l in lst:
    if str(l).find('datasets/')!=-1:
        
        string=str(l)
        s=re.search('">.*</a>',string)
        x,y=s.span()
        lst_dataset.append(string[x+2:y-4])
        
        lst_stats.append([x[0].replace(u'\xa0', u'') for x in lst[lst.index(l)+1:lst.index(l)+7]])

### Create dataframes to store information about the datasets

#### Info dataframe

In [308]:
df_stats=pd.DataFrame(data=lst_stats,columns=['Data Type','Machine Learning Task',
                                              'Feature Type(s)','Number of samples','Number of features','Year'])

#### Names dataframe

In [309]:
df_names=pd.DataFrame(data=lst_dataset,columns=['Dataset'])

#### Merged dataframe

In [310]:
df_stats_joined=pd.concat([df_names,df_stats],axis=1)

### Now, merge this name & info dataset with the previously created Abstract/dataurl dataframe. Perform an 'inner join' to match by _Abstract_ column

In [311]:
df_combined=df_joined.merge(df_stats_joined,on='Dataset')
df_combined

Unnamed: 0,Dataset,Abstract,Dataset Page,dataurl,Data Type,Machine Learning Task,Feature Type(s),Number of samples,Number of features,Year
0,"3D Road Network (North Jutland, Denmark)",3D road network with highly accurate elevation...,https://archive.ics.uci.edu/ml/datasets/3D+Roa...,https://archive.ics.uci.edu/ml//machine-learni...,"Sequential, Text","Regression, Clustering",Real,434874,4,2013
1,AAAI 2013 Accepted Papers,This data set compromises the metadata for the...,https://archive.ics.uci.edu/ml/datasets/AAAI+2...,https://archive.ics.uci.edu/ml//machine-learni...,Multivariate,Clustering,,150,5,2014
2,AAAI 2014 Accepted Papers,This data set compromises the metadata for the...,https://archive.ics.uci.edu/ml/datasets/AAAI+2...,https://archive.ics.uci.edu/ml//machine-learni...,Multivariate,Clustering,,399,6,2014
3,Abalone,Predict the age of abalone from physical measu...,https://archive.ics.uci.edu/ml/datasets/Abalone,https://archive.ics.uci.edu/ml//machine-learni...,Multivariate,Classification,"Categorical, Integer, Real",4177,8,1995
4,Abscisic Acid Signaling Network,The objective is to determine the set of boole...,https://archive.ics.uci.edu/ml/datasets/Abscis...,https://archive.ics.uci.edu/ml//machine-learni...,Multivariate,Causal-Discovery,Integer,300,43,2008
5,Activities of Daily Living (ADLs) Recognition ...,This dataset comprises information regarding t...,https://archive.ics.uci.edu/ml/datasets/Activi...,https://archive.ics.uci.edu/ml//machine-learni...,"Multivariate, Sequential, Time-Series","Classification, Clustering",,2747,,2013
6,Activity Recognition from Single Chest-Mounted...,The dataset collects data from a wearable acce...,https://archive.ics.uci.edu/ml/datasets/Activi...,https://archive.ics.uci.edu/ml//machine-learni...,"Univariate, Sequential, Time-Series","Classification, Clustering",Real,,,2014
7,Activity Recognition system based on Multisens...,This dataset contains temporal data from a Wir...,https://archive.ics.uci.edu/ml/datasets/Activi...,https://archive.ics.uci.edu/ml//machine-learni...,"Multivariate, Sequential, Time-Series",Classification,Real,42240,6,2016
8,Activity recognition with healthy older people...,Sequential motion data from 14 healthy older p...,https://archive.ics.uci.edu/ml/datasets/Activi...,https://archive.ics.uci.edu/ml//machine-learni...,Sequential,Classification,Real,75128,9,2016
9,Acute Inflammations,The data was created by a medical expert as a ...,https://archive.ics.uci.edu/ml/datasets/Acute+...,https://archive.ics.uci.edu/ml//machine-learni...,Multivariate,Classification,"Categorical, Integer",120,6,2009


### Transform the type of _Number of samples_, _Number of features_, and _Year_ to integers for sorting and easy searching

In [312]:
df_combined['Year']=pd.to_numeric(df_combined['Year'],errors='coerce')
df_combined['Number of samples']=pd.to_numeric(df_combined['Number of samples'],errors='coerce')
df_combined['Number of features']=pd.to_numeric(df_combined['Number of features'],errors='coerce')

In [313]:
df_combined

Unnamed: 0,Dataset,Abstract,Dataset Page,dataurl,Data Type,Machine Learning Task,Feature Type(s),Number of samples,Number of features,Year
0,"3D Road Network (North Jutland, Denmark)",3D road network with highly accurate elevation...,https://archive.ics.uci.edu/ml/datasets/3D+Roa...,https://archive.ics.uci.edu/ml//machine-learni...,"Sequential, Text","Regression, Clustering",Real,434874.0,4.0,2013.0
1,AAAI 2013 Accepted Papers,This data set compromises the metadata for the...,https://archive.ics.uci.edu/ml/datasets/AAAI+2...,https://archive.ics.uci.edu/ml//machine-learni...,Multivariate,Clustering,,150.0,5.0,2014.0
2,AAAI 2014 Accepted Papers,This data set compromises the metadata for the...,https://archive.ics.uci.edu/ml/datasets/AAAI+2...,https://archive.ics.uci.edu/ml//machine-learni...,Multivariate,Clustering,,399.0,6.0,2014.0
3,Abalone,Predict the age of abalone from physical measu...,https://archive.ics.uci.edu/ml/datasets/Abalone,https://archive.ics.uci.edu/ml//machine-learni...,Multivariate,Classification,"Categorical, Integer, Real",4177.0,8.0,1995.0
4,Abscisic Acid Signaling Network,The objective is to determine the set of boole...,https://archive.ics.uci.edu/ml/datasets/Abscis...,https://archive.ics.uci.edu/ml//machine-learni...,Multivariate,Causal-Discovery,Integer,300.0,43.0,2008.0
5,Activities of Daily Living (ADLs) Recognition ...,This dataset comprises information regarding t...,https://archive.ics.uci.edu/ml/datasets/Activi...,https://archive.ics.uci.edu/ml//machine-learni...,"Multivariate, Sequential, Time-Series","Classification, Clustering",,2747.0,,2013.0
6,Activity Recognition from Single Chest-Mounted...,The dataset collects data from a wearable acce...,https://archive.ics.uci.edu/ml/datasets/Activi...,https://archive.ics.uci.edu/ml//machine-learni...,"Univariate, Sequential, Time-Series","Classification, Clustering",Real,,,2014.0
7,Activity Recognition system based on Multisens...,This dataset contains temporal data from a Wir...,https://archive.ics.uci.edu/ml/datasets/Activi...,https://archive.ics.uci.edu/ml//machine-learni...,"Multivariate, Sequential, Time-Series",Classification,Real,42240.0,6.0,2016.0
8,Activity recognition with healthy older people...,Sequential motion data from 14 healthy older p...,https://archive.ics.uci.edu/ml/datasets/Activi...,https://archive.ics.uci.edu/ml//machine-learni...,Sequential,Classification,Real,75128.0,9.0,2016.0
9,Acute Inflammations,The data was created by a medical expert as a ...,https://archive.ics.uci.edu/ml/datasets/Acute+...,https://archive.ics.uci.edu/ml//machine-learni...,Multivariate,Classification,"Categorical, Integer",120.0,6.0,2009.0


### File download helper function

In [314]:
def download_file(url,directory):
    local_filename = directory+'/'+url.split('/')[-1]
    # NOTE the stream=True parameter
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024): 
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
                #f.flush()
    #return local_filename

### Function for downloading the data set from a page

In [315]:
def download_from_url(url,dataset):    
    
    cwd = os.getcwd()
    local_directory = cwd+'\\'+dataset
    if not os.path.exists(local_directory):
        os.makedirs(local_directory)

    uh= urllib.request.urlopen(url, context=ctx)
    html =uh.read().decode()
    soup=BeautifulSoup(html,'html5lib')

    links=[]
    for link in soup.find_all('a'):
        links.append(link.attrs['href'])
    
    links_to_download=[]

    if 'Index' in links:
        idx=links.index('Index')
    else:
        idx=len(links)-2
    for i in range(idx+1,len(links)):
        links_to_download.append(url+str(links[i]))

    for file_url in links_to_download:
        download_file(file_url,local_directory)
    
    print (f"Downloaded dataset from {url}")

### Function to download data by dataset name

In [316]:
def download_dataset(dataset):
    
    try:
        url=df_combined[df_combined['Dataset']==dataset]['Dataset Page'].values[0]
    except:
        print("Something wrong in the dataset name!")
        return None
    
    print(f"Downloading the dataset: {dataset}")
    download_from_url(url, dataset)

### Testing the download function

In [317]:
download_dataset('Wireless Indoor Localization')

Downloading the dataset: Wireless Indoor Localization
Downloaded dataset from https://archive.ics.uci.edu/ml/datasets/Wireless+Indoor+Localization


### Function for finding more information for any dataset

In [318]:
def dataset_information(dataset):
    
    try:
        url=df_combined[df_combined['Dataset']==dataset]['Dataset Page'].values[0]
    except:
        print("Something wrong in the dataset name!")
        return None
    
    try:
        dataset_info={}
        uh= urllib.request.urlopen(url, context=ctx)
        html =uh.read()
        soup = BeautifulSoup(html, 'html5lib')
        
        tag=soup.find('p',text='Data Set Information:')
        if tag!=None:
            text=tag.find_next('p').get_text()
            print("DATASET INFORMATION")
            print("="*50)
            print(text)
            print("="*50)
            print("\n")
            dataset_info['Dataset info']=text
        else:
            print("No information about the dataset could be found on its page!")
        
        tag=soup.find('p',text='Source:')
        if tag!=None:
            text=tag.find_next('p').get_text()
            print("SOURCE")
            print("="*50)
            print(text)
            print("="*50)
            print("\n")
            dataset_info['Dataset Source']=text
        else:
            print("No information about the source of dataset could be found on its page!")
        
        return dataset_info
    except:
        print("Could not connect")

In [319]:
d=dataset_information('Iris')

DATASET INFORMATION
This is perhaps the best known database to be found in the pattern recognition literature.  Fisher's paper is a classic in the field and is referenced frequently to this day.  (See Duda & Hart, for example.)  The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant.  One class is linearly separable from the other 2; the latter are NOT linearly separable from each other.

Predicted attribute: class of iris plant.

This is an exceedingly simple domain.

This data differs from the data presented in Fishers article (identified by Steve Chadwick,  spchadwick '@' espeedaz.net ).  The 35th sample should be: 4.9,3.1,1.5,0.2,"Iris-setosa" where the error is in the fourth feature. The 38th sample: 4.9,3.6,1.4,0.1,"Iris-setosa" where the errors are in the second and third features.  


SOURCE
Creator: 

R.A. Fisher

Donor: 

Michael Marshall (MARSHALL%PLU '@' io.arc.nasa.gov)


