# **Import Libraries**

In [1]:
import csv
import os
from datetime import datetime as dt

import pandas as pd
import requests as req
from bs4 import BeautifulSoup as bs
from requests.structures import CaseInsensitiveDict
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

# **Functions**

In [2]:
def get_html(data, lat, lon):
  """Fetch the GSMaP_CLM daily-graph page for one date and location.

  Args:
    data: Date string sent as the ``seldate`` query parameter (the notebook
      passes ``yyyymmdd`` strings such as "20211031").
    lat: Latitude as a string, sent as ``lat0``.
    lon: Longitude as a string, sent as ``lon0``.

  Returns:
    The body of the HTTP response as text.
  """
  # Build the URL from the arguments only, so the function does not depend on
  # a global named `date` happening to exist when it is called.
  url = (
    "https://sharaku.eorc.jaxa.jp/cgi-bin/trmm/GSMaP_CLM//show_graph_CLM_v1.cgi"
    "?term=DLY&seldate=" + data
    + "&lat0=" + lat
    + "&lon0=" + lon
    + "&lang=en"
  )
  return req.get(url).text

def download_data(url):
  """Download a file using HTTP Basic authorization and return it as text.

  Args:
    url: Address of the file to fetch.

  Returns:
    The response body decoded as UTF-8.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
  """
  headers = CaseInsensitiveDict()
  # NOTE(review): the credential is still embedded as a fallback so the
  # notebook keeps working unchanged; set GSMAP_BASIC_AUTH (full header value,
  # e.g. "Basic <base64>") to supply it from the environment instead.
  headers["Authorization"] = os.environ.get(
    "GSMAP_BASIC_AUTH", "Basic cmFpbm1hcDpOaXNrdXIrMTQwNA=="
  )
  response = req.get(url, headers=headers)
  # Fail loudly on HTTP errors so an error page is never parsed as CSV.
  response.raise_for_status()
  # Return directly instead of binding a local named `csv`, which would
  # shadow the imported csv module inside this function.
  return response.content.decode("utf-8")

# **Download (Here)**

**Initialization**

In [3]:
# Directory for the per-date downloads, and the extension used for every file
# this notebook writes.
folder_data, file_format = "raw/", ".csv"

Create the folder for the raw data

In [4]:
# Make sure the download directory exists before anything is written into it.
if os.path.isdir(folder_data):
  print("Folder already exist!")
else:
  os.mkdir(folder_data)
  print("Folder created!")

Folder created!


**Requirements (Change Here)**

In [5]:
# Dates (yyyymmdd) to request; one CSV is downloaded per entry.
dates = [
  "20211031",
  "20211231",
  "20220131",
  "20220331",
  "20220531",
]
# Coordinates are kept as strings because they are concatenated into the URL.
lat, lon = "-3.44", "114.75"

**Downloading Data**

In [6]:
# `tqdm` here is tqdm.notebook.tqdm, the replacement for the deprecated
# tqdm.tqdm_notebook wrapper.
for date in tqdm(dates):
  # Find the CSV download link on the graph page for this date.
  html = get_html(date, lat, lon)
  parser = bs(html, "html.parser")
  container = parser.find("div", {"id": "graph_dl"})
  anchor = container.find("a") if container is not None else None
  if anchor is None:
    # Without this check a missing link surfaces as an opaque NoneType error.
    raise RuntimeError("No CSV download link found for date " + date)
  result = download_data(anchor["href"])

  # Keep only the first two columns of each line; blank lines parse as empty
  # rows and are skipped instead of raising IndexError.
  rows = [
    [row[0], row[1]]
    for row in csv.reader(result.splitlines(), delimiter=",")
    if row
  ]

  # newline="" is what the csv module requires of files it writes to (avoids
  # doubled line endings on Windows); `with` closes the file even on error.
  with open(folder_data + date + file_format, "w", newline="", encoding="utf-8") as file:
    csv.writer(file).writerows(rows)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


  0%|          | 0/5 [00:00<?, ?it/s]

# **Merge**

**Merging Data**

In [7]:
# One CSV path per requested date: folder + date + extension.
csvs = ["".join((folder_data, stamp, file_format)) for stamp in dates]

In [8]:
# Display the list of per-date CSV paths as this cell's output.
csvs

['raw/20211031.csv',
 'raw/20211231.csv',
 'raw/20220131.csv',
 'raw/20220331.csv',
 'raw/20220531.csv']

In [9]:
# Read each per-date CSV, then stack them into one frame with a fresh index.
frames = [pd.read_csv(path) for path in csvs]
df = pd.concat(frames, ignore_index=True)

**Check for Duplicate Dates**

In [10]:
# Flag every row whose date already appeared earlier in the merged frame.
is_repeat = df["Date(yyyymmdd)"].duplicated()
duplicate_checked = is_repeat.sum()

if duplicate_checked == 0:
  print("No duplicate record found!")
else:
  print(duplicate_checked, "duplicate records found!")
  # Keep the first occurrence of each date; the original index is preserved.
  df = df[~is_repeat]
  print(duplicate_checked, "duplicate records have been removed!")

40 duplicate records found!
40 duplicate records have been removed!


**Remove Unused Dates**

In [11]:
# Build a yyyymm00 string from the first requested date; as an integer it is
# smaller than every day of that month.
first_requested = dates[0]
start_year, start_month = first_requested[:4], first_requested[4:6]
start_date = "00"
date_start = "".join((start_year, start_month, start_date))

In [12]:
# Keep only rows whose date is strictly greater than the numeric cutoff.
cutoff = int(date_start)
df_used = df.loc[df["Date(yyyymmdd)"].gt(cutoff)]

# **Save File**

**File Path and Name**

In [13]:
path_cleaned = "merged/"

# The first and last dates of the filtered frame name the merged file.
used_dates = df_used["Date(yyyymmdd)"]
start_date, end_date = used_dates.iloc[0], used_dates.iloc[-1]

merged_name = "-".join([str(start_date), str(end_date)])

**Check That the Output Folder Exists**

In [14]:
# Make sure the output directory exists before the merged file is saved.
if os.path.isdir(path_cleaned):
  print("Folder already exist!")
else:
  os.mkdir(path_cleaned)
  print("Folder created!")

Folder created!


**Save to CSV**

In [15]:
# Compose the destination once, write the frame without its index column,
# then report where the file was written.
merged_path = path_cleaned + merged_name + file_format
df_used.to_csv(merged_path, index=False)
print("You can download the final file at", merged_path)

You can download the final file at merged/20211001-20220531.csv


# **Preview**

In [16]:
# Print a summary of the filtered frame: index range, columns, non-null counts, dtypes.
df_used.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243 entries, 32 to 314
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date(yyyymmdd)  243 non-null    int64  
 1   Rain[mm/day]    243 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 5.7 KB


In [17]:
# Display the filtered frame as this cell's output.
df_used

Unnamed: 0,Date(yyyymmdd),Rain[mm/day]
32,20211001,0.43
33,20211002,14.19
34,20211003,18.28
35,20211004,0.28
36,20211005,7.75
...,...,...
310,20220527,0.85
311,20220528,1.96
312,20220529,0.74
313,20220530,0.00
