## More Preprocessing
Drop the columns that are entirely null and overwrite the preprocessed files in S3 with the cleaned versions.

In [2]:
!pip install -q --upgrade "s3fs>=2023.6.0"

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.38.7 requires botocore==1.37.7, but you have botocore 1.37.1 which is incompatible.
boto3 1.37.7 requires botocore<1.38.0,>=1.37.7, but you have botocore 1.37.1 which is incompatible.[0m[31m
[0m

### Load Train, Val, Test in Chunks from S3

In [3]:
import pandas as pd
from tqdm import tqdm

# S3 locations of the preprocessed splits, keyed by split name
base_path = "s3://fireguarddata/data/preprocessed_data/"
paths = {split: f"{base_path}{split}.csv" for split in ("train", "val", "test")}

# Function to load in chunks
def load_csv_in_chunks(path, chunksize= 1_000_000):
    """Read a CSV in fixed-size row chunks and return it as a single DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV. ``storage_options={"anon": False}`` is always
        passed to ``pd.read_csv`` along with it.
    chunksize : int, default 1_000_000
        Number of rows read per chunk; one progress-bar tick per chunk.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in file order, re-indexed from 0.

    Notes
    -----
    Every chunk is held in a list until the final concatenation.
    """
    # Using the chunked reader as a context manager closes the underlying
    # handle even when parsing fails part-way through the file.
    with pd.read_csv(path, chunksize=chunksize, storage_options={"anon": False}) as reader:
        chunks = list(tqdm(reader, desc=f"Loading {path}"))
    return pd.concat(chunks, ignore_index=True)

# Load the three splits in order: train, then val, then test
train_df, val_df, test_df = (
    load_csv_in_chunks(paths[split]) for split in ("train", "val", "test")
)

Loading s3://fireguarddata/data/preprocessed_data/train.csv: 65it [13:02, 12.03s/it]
Loading s3://fireguarddata/data/preprocessed_data/val.csv: 9it [01:36, 10.74s/it]
Loading s3://fireguarddata/data/preprocessed_data/test.csv: 9it [01:32, 10.26s/it]


### Inspecting Current Columns


In [4]:
# Print the column list of every split: train first, then val and test.
for position, (label, frame) in enumerate(
    [("Train", train_df), ("Val", val_df), ("Test", test_df)]
):
    lead = "\n" if position else ""  # blank line between splits, none before the first
    print(f"{lead}Columns in {label}:")
    print(frame.columns.tolist())

Columns in Train:
['latitude', 'longitude', 'u10', 'v10', 'sp', 'lai_hv', 'lai_lv', 'tvl', 'cl', 'swvl1', 'daynight', 'tvh', 'ie', 'd2m', 't2m', 'tcc', 'tcrw', 'rsn', 'sd', 'tsn', 'slt', 'year', 'month', 'day', 'wind_speed', 'wind_direction', 'fuel_load', 'number', 'surface', 'depthBelowLandLayer', 'fire_occurrence']

Columns in Val:
['latitude', 'longitude', 'u10', 'v10', 'sp', 'lai_hv', 'lai_lv', 'tvl', 'cl', 'swvl1', 'daynight', 'tvh', 'ie', 'd2m', 't2m', 'tcc', 'tcrw', 'rsn', 'sd', 'tsn', 'slt', 'year', 'month', 'day', 'wind_speed', 'wind_direction', 'fuel_load', 'number', 'surface', 'depthBelowLandLayer', 'fire_occurrence']

Columns in Test:
['latitude', 'longitude', 'u10', 'v10', 'sp', 'lai_hv', 'lai_lv', 'tvl', 'cl', 'swvl1', 'daynight', 'tvh', 'ie', 'd2m', 't2m', 'tcc', 'tcrw', 'rsn', 'sd', 'tsn', 'slt', 'year', 'month', 'day', 'wind_speed', 'wind_direction', 'fuel_load', 'number', 'surface', 'depthBelowLandLayer', 'fire_occurrence']


### Dropping Entirely-Null Columns and Overwriting with the Cleaned Version
- Drop the columns that contain only nulls.
- Overwrite the current dataset in S3 with the cleaned version.
- List all columns present afterwards.

In [5]:
import s3fs

# Drop columns
# NOTE(review): the section heading describes these columns as entirely null;
# that is not verified in this cell — confirm before relying on it.
cols_to_drop = ["number", "surface", "depthBelowLandLayer"]

splits = {"train": train_df, "val": val_df, "test": test_df}

# errors="ignore" turns the drop into a no-op for columns that are already absent.
for frame in splits.values():
    frame.drop(columns=cols_to_drop, inplace=True, errors="ignore")

# Check for any NaNs (total count of null cells per split)
print("Missing values check:")
for split, frame in splits.items():
    print(f"{split.capitalize()} NaNs:", frame.isnull().sum().sum())

# Save cleaned versions back to S3
# NOTE: these are the same paths the splits were loaded from, so the source
# objects are overwritten.
fs = s3fs.S3FileSystem(anon=False)

print("\n Saving cleaned files back to S3...")

# Each handle is closed by its `with` block, so every upload is flushed and
# finalised before the success message is printed, instead of depending on
# garbage collection of a file object that was never closed explicitly.
for split, frame in splits.items():
    with fs.open(paths[split], "w") as handle:
        frame.to_csv(handle, index=False)

print("All cleaned files saved successfully.")


Missing values check:
Train NaNs: 0
Val NaNs: 0
Test NaNs: 0

 Saving cleaned files back to S3...
All cleaned files saved successfully.


In [6]:
# Print the column list of each split, with a blank line between splits.
split_frames = {"Train": train_df, "Val": val_df, "Test": test_df}
print(
    "\n\n".join(
        f"Columns in {label}:\n{frame.columns.tolist()}"
        for label, frame in split_frames.items()
    )
)

Columns in Train:
['latitude', 'longitude', 'u10', 'v10', 'sp', 'lai_hv', 'lai_lv', 'tvl', 'cl', 'swvl1', 'daynight', 'tvh', 'ie', 'd2m', 't2m', 'tcc', 'tcrw', 'rsn', 'sd', 'tsn', 'slt', 'year', 'month', 'day', 'wind_speed', 'wind_direction', 'fuel_load', 'fire_occurrence']

Columns in Val:
['latitude', 'longitude', 'u10', 'v10', 'sp', 'lai_hv', 'lai_lv', 'tvl', 'cl', 'swvl1', 'daynight', 'tvh', 'ie', 'd2m', 't2m', 'tcc', 'tcrw', 'rsn', 'sd', 'tsn', 'slt', 'year', 'month', 'day', 'wind_speed', 'wind_direction', 'fuel_load', 'fire_occurrence']

Columns in Test:
['latitude', 'longitude', 'u10', 'v10', 'sp', 'lai_hv', 'lai_lv', 'tvl', 'cl', 'swvl1', 'daynight', 'tvh', 'ie', 'd2m', 't2m', 'tcc', 'tcrw', 'rsn', 'sd', 'tsn', 'slt', 'year', 'month', 'day', 'wind_speed', 'wind_direction', 'fuel_load', 'fire_occurrence']


In [9]:
# Summarise test_df: index range, column names and dtypes.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8054293 entries, 0 to 8054292
Data columns (total 28 columns):
 #   Column           Dtype  
---  ------           -----  
 0   latitude         float64
 1   longitude        float64
 2   u10              float64
 3   v10              float64
 4   sp               float64
 5   lai_hv           float64
 6   lai_lv           float64
 7   tvl              float64
 8   cl               float64
 9   swvl1            float64
 10  daynight         float64
 11  tvh              float64
 12  ie               float64
 13  d2m              float64
 14  t2m              float64
 15  tcc              float64
 16  tcrw             float64
 17  rsn              float64
 18  sd               float64
 19  tsn              float64
 20  slt              float64
 21  year             float64
 22  month            float64
 23  day              float64
 24  wind_speed       float64
 25  wind_direction   float64
 26  fuel_load        float64
 27  fire_occurre

In [12]:
# Per-column dtypes of the training frame.
train_df.dtypes

latitude           float64
longitude          float64
u10                float64
v10                float64
sp                 float64
lai_hv             float64
lai_lv             float64
tvl                float64
cl                 float64
swvl1              float64
daynight           float64
tvh                float64
ie                 float64
d2m                float64
t2m                float64
tcc                float64
tcrw               float64
rsn                float64
sd                 float64
tsn                float64
slt                float64
year               float64
month              float64
day                float64
wind_speed         float64
wind_direction     float64
fuel_load          float64
fire_occurrence      int64
dtype: object

### Checking memory usage
Comparing the memory footprint of `train_df` stored as float64 vs. float32.

In [14]:
# Total memory footprint of train_df in bytes (summed over the per-column usage).
train_df.memory_usage(deep=True).sum()

14433292736

In [17]:
# Cast every column of train_df to float32 and rebind the name to the result.
# NOTE(review): this also converts the int64 `fire_occurrence` column to
# float32, and val_df / test_df are not cast in the cells visible here —
# confirm both are intended.
train_df= train_df.astype('float32')

In [18]:
# Total memory footprint of train_df in bytes, measured again.
train_df.memory_usage(deep=True).sum()

7216646432

In [19]:
# (rows, columns) of the training frame.
train_df.shape

(64434342, 28)

### Updating the `fire_occurrence` threshold to 320 for day and 330 for night