# Preprocessing
#### In this script, the crowdsourced data is loaded, and its time and spatial coordinates are formatted.

In [1]:
#Load packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import xarray as xr
import datetime as dt
import cartopy.crs as ccrs
import geopandas as gpd
import cartopy.feature as cf

from climada import CONFIG
import scClim as sc
import seaborn as sns

# Resolve the local data/output directories from the CLIMADA configuration once
# and keep them as plain strings (data_dir is interpolated into file paths below).
local_data_cfg = CONFIG.local_data
data_dir, out_dir = str(local_data_cfg.data_dir), str(local_data_cfg.out_dir)

In [14]:
#Load raw data
crowdsourcing_path = f'{data_dir}/crowd-source/crowd_consolidated_2015-05-02_2023-10-15.csv'
# These columns are forced to string dtype when the CSV is parsed
str_cols = ['OsVersion','AppVersion','Language','size_text','size_text_new']
crowd_data = pd.read_csv(crowdsourcing_path,dtype={v:str for v in str_cols})

#Remove reports with suspicious reporting behaviour as defined by Barras et al (2019) and extended by the 4-reports criterion by Kopp et al (2024)
# A report is kept only if all three flags are 0.
cond=(crowd_data['Flag_30min']==0) & (crowd_data['Flag_blacklist']==0) & (crowd_data['Flag_N_day_ID_4']==0)
n_removed = (~cond).sum()  # computed once, reused for the count and the percentage
print(f"Removing reports based on suspicious user behaviour. Total: {n_removed} reports: {n_removed/crowd_data.shape[0]*100:.2f}%")
# Take an explicit copy of the filtered rows: new columns are assigned below, and
# assigning to the result of a boolean .loc selection can raise SettingWithCopyWarning.
crowd_data = crowd_data.loc[cond].copy()

#For now only apply basic filtering based on the CZC flag (CZC>35)
# FILTEREDOUT is a plain copy of the Flag_CZC column.
crowd_data['FILTEREDOUT'] = crowd_data['Flag_CZC']

#Add time and hour columns
crowd_data["time_dt"] = pd.to_datetime(crowd_data.Time).dt.tz_localize(None) # Remove timezone information (UTC)
crowd_data['hailday'] = (crowd_data["time_dt"] - pd.Timedelta(hours=6)).dt.date  #use -6h, to get haildays correctly! (1 hailday: 6UTC-6UTC)

#Add columns with time of day
# hour_int: integer hour of the timestamp; hour: fractional hour (hour + minute/60)
crowd_data['hour_int'] = crowd_data.time_dt.dt.hour
crowd_data["hour"] = crowd_data.time_dt.dt.hour + crowd_data.time_dt.dt.minute/60

# Convert hail size codes to corresponding columns: size_mm, size_text, size_mm_text
crowd_data = sc.crowd_process.map_hailsize(crowd_data)

Removing reports based on suspicious user behaviour. Total: 16511 reports: 4.75%


In [16]:
# Reproject the report locations: the x/y columns are read as EPSG:21781 points
# and transformed to EPSG:2056 (Swiss coordinates); results are stored as chx/chy.
points_epsg21781 = gpd.points_from_xy(crowd_data['x'], crowd_data['y'], crs='epsg:21781')
geometry = points_epsg21781.to_crs(epsg=2056)
for column_name, values in (('chx', geometry.x), ('chy', geometry.y)):
    crowd_data[column_name] = values

In [17]:
# Report counts (excluding 'no hail' reports), in total and for haildays after 1 July 2020.
# Note: the comparison is strict, so hailday 2020-07-01 itself is not counted.
is_hail_report = ~crowd_data.no_hail
is_recent_hail_report = is_hail_report & (crowd_data.hailday > dt.date(2020, 7, 1))
n_hail_reports = is_hail_report.sum()
n_recent_hail_reports = is_recent_hail_report.sum()

print(f"Total reports (excluding 'no hail'):{n_hail_reports}")
print(f"Total reports after July 2020:{n_recent_hail_reports}")
print(f"Fraction of reports after July 2020:{n_recent_hail_reports/n_hail_reports:.1%}")

Total reports (excluding 'no hail'):260099
Total reports after July 2020:214710
Fraction of reports after July 2020:82.5%


In [18]:
# Write the processed reports to CSV (the DataFrame index is not written).
processed_csv_path = f'{data_dir}/crowd-source/crowd_processed_paper.csv'
crowd_data.to_csv(processed_csv_path, index=False)