# Data source: https://registry.opendata.aws/noaa-gsod/

In [1]:
import time
import s3fs
import pandas as pd 

In [4]:
# Filesystem handle shared by every later cell: an anonymous connection
# (anon=True) whose client is configured for the us-east-1 region.
fs = s3fs.S3FileSystem(
    anon=True,
    client_kwargs={"region_name": "us-east-1"},
)

In [5]:
# Prefix of the 2024 folder in the bucket; listing it shows which weather
# stations have a file for that year.
s3_path = "s3://noaa-gsod-pds/2024/"

In [6]:
# List every object stored under the 2024 prefix.
try:
    files = fs.ls(s3_path)
except Exception as e:
    # Report the failure, then re-raise it: if the listing fails, `files` is
    # never assigned and every later cell would only die with an unrelated
    # NameError that hides the real cause.
    print("Error:", e)
    raise

In [8]:
# Peek at the first five keys returned by the listing
files[:5]

['noaa-gsod-pds/2024/01001099999.csv',
 'noaa-gsod-pds/2024/01001499999.csv',
 'noaa-gsod-pds/2024/01002099999.csv',
 'noaa-gsod-pds/2024/01003099999.csv',
 'noaa-gsod-pds/2024/01006099999.csv']

In [9]:
# Read each listed 2024 file into its own DataFrame, keeping them all in
# `dfs`, and measure how long the complete pass takes.
start_time = time.time()

dfs = []
for key in files:
    with fs.open(key, mode="rb") as handle:
        dfs.append(pd.read_csv(handle))

end_time = time.time()
# Elapsed seconds; as the last expression of the cell it is shown as output.
end_time - start_time

2039.7869002819061

### As the elapsed time printed above shows, the previous step took a while (~34 min)

In [13]:
# Stack all the 2024 data frames collected in dfs into a single data frame;
# ignore_index=True discards each file's own row index and renumbers the rows.
gsod = pd.concat(dfs, ignore_index=True)

In [15]:
# Number of rows in the 2024 data frame
len(gsod)

1931730

### Let's check the latitude and longitude ranges of the data to see whether Tanzania's longitude and latitude fall within them

In [16]:
# Longitude range covered by the 2024 data (Tanzania's longitude is ~ 35)
gsod['LONGITUDE'].min(), gsod['LONGITUDE'].max()

(-179.9833333, 179.75)

In [17]:
# Latitude range covered by the 2024 data (Tanzania's latitude is ~ -6)
(gsod['LATITUDE'].min(), gsod['LATITUDE'].max())

(-90.0, 83.65)

### By filtering on the station names I can get the station numbers. These are needed to fetch the data, because the csv files are named after the stations (e.g. 63729099999.csv)

In [18]:
# Collect the identifiers of the Tanzanian stations; they are used later to
# build the per-year file names.
# NAME is compared as text (astype(str) also turns missing names into "nan").
# The country code is matched only at the END of the name, so a " TZ" that
# merely appears somewhere inside a station name no longer selects it.
# NOTE(review): assumes NAME ends with the country code ("..., TZ") — confirm.
is_tanzanian = gsod["NAME"].astype(str).str.strip().str.endswith(" TZ")
stations = gsod.loc[is_tanzanian, "STATION"].unique()
stations

array([63729099999, 63733099999, 63756099999, 63789099999, 63791099999,
       63801099999, 63832099999, 63845099999, 63862099999, 63866099999,
       63870099999, 63881099999, 63887099999, 63894099999, 63962099999,
       63971099999], dtype=object)

### Now let's repeat the process over the whole content of the S3 bucket to obtain all the data from Tanzania for every available year

In [20]:
# Root of the S3 bucket; listing it returns the top-level entries
# (the per-year folders).
s3_path = "s3://noaa-gsod-pds/"

In [21]:
start = time.perf_counter()

# Walk the top-level folders of the bucket and, in each one, try to read the
# file of every Tanzanian station identified from the 2024 data.
# `files` records every key that was tried; `dfs` collects one DataFrame per
# file that could actually be read.
try:
    files = []
    dfs = []
    folders = fs.ls(s3_path)
    # Drop the last listed entry.
    # NOTE(review): presumably a non-year object at the bucket root — confirm.
    folders.pop()

    for folder in folders:
        for station in stations:
            # The bucket's file names are 11 characters long
            # (e.g. 01001099999.csv), so restore any leading zeros that were
            # lost if the station id was parsed as an integer.
            file = f"{folder}/{str(station).zfill(11)}.csv"
            files.append(file)
            try:
                with fs.open(file, mode='rb') as f:
                    data = pd.read_csv(f)
            except FileNotFoundError as a:
                # Expected case: the station has no file for this year.
                print("File", a, " not in database")
            except Exception as a:
                # Anything else (network, permissions, malformed CSV) is a
                # real problem: report it as such instead of as a missing
                # file, and carry on with the remaining files.
                print("Error reading", file, ":", a)
            else:
                dfs.append(data)

except Exception as e:
    print("Error:", e)

end = time.perf_counter()

# Elapsed time in seconds
print(end - start)

File noaa-gsod-pds/1929/63729099999.csv  not in database
File noaa-gsod-pds/1929/63733099999.csv  not in database
File noaa-gsod-pds/1929/63756099999.csv  not in database
File noaa-gsod-pds/1929/63789099999.csv  not in database
File noaa-gsod-pds/1929/63791099999.csv  not in database
File noaa-gsod-pds/1929/63801099999.csv  not in database
File noaa-gsod-pds/1929/63832099999.csv  not in database
File noaa-gsod-pds/1929/63845099999.csv  not in database
File noaa-gsod-pds/1929/63862099999.csv  not in database
File noaa-gsod-pds/1929/63866099999.csv  not in database
File noaa-gsod-pds/1929/63870099999.csv  not in database
File noaa-gsod-pds/1929/63881099999.csv  not in database
File noaa-gsod-pds/1929/63887099999.csv  not in database
File noaa-gsod-pds/1929/63894099999.csv  not in database
File noaa-gsod-pds/1929/63962099999.csv  not in database
File noaa-gsod-pds/1929/63971099999.csv  not in database
File noaa-gsod-pds/1930/63729099999.csv  not in database
File noaa-gsod-pds/1930/6373309

### ~10 min. Much better, no?

In [24]:
# Stack all the per-file data frames collected in dfs into a single data
# frame; ignore_index=True renumbers the rows instead of keeping each file's
# own index.
gsod_tz = pd.concat(dfs, ignore_index=True)

In [25]:
# Save the combined data frame as .csv (path is relative to the notebook).
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if that column is not wanted downstream.
gsod_tz.to_csv("../data/noaa.gsod-pds.csv")