#**Solving the datascience coding challenge**

In [8]:
# Importing the libraries

import bigframes.pandas as bf
import pandas as pd


bf.options.bigquery.location = "EU" #this variable is set based on the dataset you chose to query
bf.options.bigquery.project = "neat-bricolage-425013-d4" #this variable is set based on the dataset you chose to query

In [9]:
#reading the dataset
df = bf.read_gbq("neat-bricolage-425013-d4.challenge_101.challenge_20") #this variable is set based on the dataset you chose to query

  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
# overview of the dataset
# Look at the first 20 rows.
df.head(20)

Unnamed: 0,date,station_number,mean_temp,mean_dew_point,mean_sealevel_pressure,mean_visibility,mean_wind_speed,max_temperature,total_precipitation,snow_depth,fog,snow,hail,thunder
0,2008-01-08,725300,54.200001,51.200001,1006.099976,8.3,11.6,41.0,0.39,3.76,False,False,False,False
1,2008-01-20,725315,4.9,-5.7,1034.199951,10.0,8.2,-4.0,0.0,0.0,False,False,False,False
2,2010-03-01,725326,34.5,27.6,17.7,10.0,6.5,30.200001,0.02,0.0,False,False,False,False
3,2009-08-10,725300,78.0,66.0,1014.299988,10.0,7.2,69.800003,0.0,0.0,False,False,False,False
4,2008-05-03,725316,55.0,40.599998,1005.700012,10.0,13.9,48.200001,0.2,0.0,False,False,False,False
5,2007-12-02,725330,42.0,36.299999,1013.700012,7.4,11.8,19.0,0.23,0.0,False,False,False,False
6,2007-01-06,725317,43.900002,40.0,1013.299988,8.3,7.9,35.599998,0.0,0.0,False,False,False,False
7,2006-10-15,725317,44.0,23.200001,1019.0,10.0,4.7,28.9,0.0,0.0,False,False,False,False
8,2006-09-05,725327,64.599998,58.799999,1019.400024,6.8,2.2,59.0,0.0,0.0,False,False,False,False
9,2008-12-18,725314,26.5,21.700001,1026.0,8.5,4.0,21.200001,0.0,0.0,False,False,False,False


In [11]:
# Print the data types
print(df.dtypes)

date                      date32[day][pyarrow]
station_number                           Int64
mean_temp                              Float64
mean_dew_point                         Float64
mean_sealevel_pressure                 Float64
mean_visibility                        Float64
mean_wind_speed                        Float64
max_temperature                        Float64
total_precipitation                    Float64
snow_depth                             Float64
fog                                    boolean
snow                                   boolean
hail                                   boolean
thunder                                boolean
dtype: object


## **Part 1**

### **1. Task**
Change the date format to 'YYYY-MM-DD' and select the data from 2005 till 2009 for station numbers including and between 725300 and 726300 , and save it as a pandas dataframe. Note the maximum year available is 2010.

In [12]:
df['date'] = df['date'].dt.strftime('%Y-%m-%d')

In [13]:
# Verify the date formatting
print(df['date'].head())

0    2008-01-08
1    2008-01-20
2    2010-03-01
3    2009-08-10
4    2008-05-03
Name: date, dtype: string


In [17]:
# Filter data for years 2005 to 2009 and station numbers between 725300 and 726300
df_filtered = df[
    (df['date'] >= '2005-01-01') &
    (df['date'] <= '2010-12-31') &
    (df['station_number'] >= 725300) &
    (df['station_number'] <= 726300)
]

In [20]:
# Display the first 20 rows of the filtered DataFrame
print(df_filtered.head(20))

          date  station_number  mean_temp  mean_dew_point  \
0   2008-01-08          725300  54.200001       51.200001   
1   2008-01-20          725315        4.9            -5.7   
2   2010-03-01          725326       34.5            27.6   
3   2009-08-10          725300       78.0            66.0   
4   2008-05-03          725316       55.0       40.599998   
5   2007-12-02          725330       42.0       36.299999   
6   2007-01-06          725317  43.900002            40.0   
7   2006-10-15          725317       44.0       23.200001   
8   2006-09-05          725327  64.599998       58.799999   
9   2008-12-18          725314       26.5       21.700001   
10  2007-12-10          725316  32.099998            28.9   
11  2006-10-10          725316  60.700001            51.5   
12  2006-05-16          725327  49.900002       48.900002   
13  2009-06-16          725330  68.400002       54.900002   
14  2006-04-25          725317  52.599998       44.799999   
15  2010-03-20          

In [21]:
# Save the filtered DataFrame as a pandas DataFrame
filtered_df = pd.DataFrame(df_filtered)

### **2. Task**
From here you want to work with the data from all stations 725300 to 725330 that have information from 2005 till 2009.

In [22]:
df_station_filtered = df[
    (df['date'] >= '2005-01-01') &
    (df['date'] <= '2009-12-31') &
    (df['station_number'] >= 725300) &
    (df['station_number'] <= 725330)
]

Do a first analysis of the remaining dataset, clean or drop data depending on how you see appropriate.

In [23]:
missing_values = df_station_filtered.isnull().sum()
print("Missing values:\n", missing_values)

Missing values:
 

date                      0
station_number            0
mean_temp                 0
mean_dew_point            0
mean_sealevel_pressure    0
mean_visibility           0
mean_wind_speed           0
max_temperature           0
total_precipitation       0
snow_depth                0
fog                       0
snow                      0
hail                      0
thunder                   0
dtype: Int64


### **3. Task**
Now it is time to split the data, into a training, evaluation and test set. As a reminder, the date we are trying to predict snow fall for should constitute your test set.

In [32]:
from sklearn.model_selection import train_test_split
from random import shuffle
# Define features (X) and target (y)
# Define features (X) and target (y)
X = df_station_filtered.drop(columns=['snow', 'date'])
y = df_station_filtered['snow']  # Target variable ('snow')


In [34]:
# Convert BigQuery DataFrame to pandas DataFrame if not already done
if not isinstance(X, pd.DataFrame):
    X = X.to_pandas()
if not isinstance(y, pd.Series):
    y = y.to_pandas()

In [38]:
# Check class distribution
print("Class distribution in target variable:")
print(y.value_counts())

Class distribution in target variable:
snow
False    12991
True      1281
Name: count, dtype: Int64


In [35]:
# Split data into training (60%) and temporary (40%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

In [36]:
# Split the temporary set into validation (50% of 40%) and test (50% of 40%) sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [37]:
# Check sizes of datasets
print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}")

Training set: (8563, 12), Validation set: (2854, 12), Test set: (2855, 12)


### **Part 2**

If you made it up to here all by yourself, you can use your prepared dataset to train an algorithm of your choice to forecast whether it will snow on the following date for each station in this dataset:

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming X_train, y_train, X_val, y_val, X_test, y_test are already defined and scaled

# Initialize the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [40]:
# Train the model
model.fit(X_train, y_train)

In [41]:
# Predict on the validation set
y_val_pred = model.predict(X_val)

In [42]:
# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_classification_report = classification_report(y_val, y_val_pred)
val_confusion_matrix = confusion_matrix(y_val, y_val_pred)

In [43]:
print("Validation Accuracy:", val_accuracy)
print("Validation Classification Report:\n", val_classification_report)
print("Validation Confusion Matrix:\n", val_confusion_matrix)

Validation Accuracy: 1.0
Validation Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2607
         1.0       1.00      1.00      1.00       247

    accuracy                           1.00      2854
   macro avg       1.00      1.00      1.00      2854
weighted avg       1.00      1.00      1.00      2854

Validation Confusion Matrix:
 [[2607    0]
 [   0  247]]


In [45]:
# Predict on the test set
y_test_pred = model.predict(X_test)

In [46]:
# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_classification_report = classification_report(y_test, y_test_pred)
test_confusion_matrix = confusion_matrix(y_test, y_test_pred)


In [47]:
print("Test Accuracy:", test_accuracy)
print("Test Classification Report:\n", test_classification_report)
print("Test Confusion Matrix:\n", test_confusion_matrix)

Test Accuracy: 1.0
Test Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      2614
         1.0       1.00      1.00      1.00       241

    accuracy                           1.00      2855
   macro avg       1.00      1.00      1.00      2855
weighted avg       1.00      1.00      1.00      2855

Test Confusion Matrix:
 [[2614    0]
 [   0  241]]
