# Download the Modern Slavery Dataset

In [None]:
# Run only once (%pip installs into the environment of the running kernel)
# %pip install pandas
# %pip install boto3

In [4]:
import os

import boto3
import pandas

In [122]:
# Name of the S3 bucket that hosts the dataset files.
s3_bucket = 'modern-slavery-dataset'

# Client reused by the download cells further down.
s3_client = boto3.client('s3')

# NOTE(review): a single list_objects_v2 call returns at most 1,000 keys;
# that is enough for the two files shown below, but a paginator would be
# needed if the bucket ever grows past that.
response = s3_client.list_objects_v2(Bucket=s3_bucket)

print("Modern Slavery Text Corpus is available in the following formats: \n")

# 'Contents' is absent from the response when no objects are returned, so
# default to an empty list instead of iterating over None.
contents = response.get('Contents', [])
for s3_object in contents:
    print(s3_object['Key'])

Modern Slavery Text Corpus is available in the following formats: 

modern_slavery_dataset.csv
modern_slavery_dataset.json




pandas treats NULL values differently in the two formats, so use whichever one you prefer.



## Option 1 - Download the CSV

In [48]:
# download_file does not create missing parent directories, so make sure the
# local data/ folder exists before writing the CSV into it.
os.makedirs('data', exist_ok=True)

s3_client.download_file(Bucket=s3_bucket, Key='modern_slavery_dataset.csv', Filename='data/modern_slavery_dataset.csv')

In [73]:
# Load the downloaded CSV copy of the dataset into a DataFrame.
df_csv = pandas.read_csv(
    filepath_or_buffer='data/modern_slavery_dataset.csv',
)

In [114]:
# Summarise the CSV-loaded frame: row count, columns, non-null counts and dtypes.
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28417 entries, 0 to 28416
Data columns (total 15 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Company ID                                    28417 non-null  int64  
 1   Company                                       27528 non-null  object 
 2   Is Publisher                                  27528 non-null  object 
 3   Statement ID                                  27528 non-null  float64
 4   URL                                           28417 non-null  object 
 5   Override URL                                  18 non-null     object 
 6   Companies House Number                        21887 non-null  object 
 7   Industry                                      27528 non-null  object 
 8   HQ                                            27528 non-null  object 
 9   Is Also Covered                               27528 non-null 

In [115]:
# Preview the first rows of the CSV-loaded frame.
df_csv.head()

Unnamed: 0,Company ID,Company,Is Publisher,Statement ID,URL,Override URL,Companies House Number,Industry,HQ,Is Also Covered,UK Modern Slavery Act,California Transparency in Supply Chains Act,Australia Modern Slavery Act,Period Covered,Text
0,7676,"""K"" Line Holding Europe Limited",True,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5005018.0,Marine,United Kingdom,False,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
1,28660,"""K"" Line Bulk Shipping (UK) Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,4830352.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
2,28659,"""K"" Line (Europe) Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,5639474.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
3,28661,"""K"" Line LNG Shipping Limited",False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
4,28658,Polar LNG Shipping (UK) Limited,False,35092.0,https://img1.wsimg.com/blobby/go/7695baff-3f0f...,,2205323.0,Marine,United Kingdom,True,True,False,False,2018-2019,K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...


## Option 2 - Download the JSON

In [65]:
# download_file does not create missing parent directories, so make sure the
# local data/ folder exists before writing the JSON file into it.
os.makedirs('data', exist_ok=True)

s3_client.download_file(Bucket=s3_bucket, Key='modern_slavery_dataset.json', Filename='data/modern_slavery_dataset.json')

In [88]:
# Load the downloaded JSON copy of the dataset into a DataFrame.
df_json = pandas.read_json(
    path_or_buf='data/modern_slavery_dataset.json',
)

In [117]:
# Summarise the JSON-loaded frame: row count, columns, non-null counts and dtypes.
df_json.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28417 entries, 0 to 28416
Data columns (total 15 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Company ID                                    28417 non-null  int64  
 1   Company                                       27528 non-null  object 
 2   Is Publisher                                  27528 non-null  object 
 3   Statement ID                                  27528 non-null  float64
 4   URL                                           28417 non-null  object 
 5   Override URL                                  18 non-null     object 
 6   Companies House Number                        21887 non-null  object 
 7   Industry                                      27528 non-null  object 
 8   HQ                                            27528 non-null  object 
 9   Is Also Covered                               27528 non-null 

0        K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
1        K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
2        K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
3        K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
4        K LINE\nHOLDINC (EUROPE) LTD.\nModern Slavery ...
                               ...                        
28412                                                 None
28413                                                 None
28414                                                 None
28415                                                 None
28416                                                 None
Name: Text, Length: 28417, dtype: object