In [129]:
import pandas as pd

daily_data = pd.read_csv('daily_data.csv')
# submission = pd.read_csv('submission.csv')

In [130]:
daily_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2893 entries, 0 to 2892
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   day_id                    2893 non-null   object 
 1   city_id                   2893 non-null   object 
 2   temperature_celsius       2893 non-null   float64
 3   condition_text            479 non-null    object 
 4   wind_kph                  2893 non-null   float64
 5   wind_degree               2893 non-null   int64  
 6   pressure_mb               2893 non-null   float64
 7   precip_mm                 2893 non-null   float64
 8   humidity                  2893 non-null   int64  
 9   cloud                     2893 non-null   int64  
 10  feels_like_celsius        2893 non-null   float64
 11  visibility_km             2893 non-null   float64
 12  uv_index                  2893 non-null   float64
 13  gust_kph                  2893 non-null   float64
 14  air_qual

In [131]:
daily_data.head()

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise,sunset
0,D0001,C001,27.0,,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2,06:04 AM,07:19 PM
1,D0002,C001,22.0,,6.1,170,1006.0,0.0,73,75,24.5,10.0,1.0,23.4,1,06:05 AM,07:18 PM
2,D0003,C001,20.0,Light Rain with Thunder,3.6,10,1011.0,4.5,100,75,20.0,10.0,1.0,12.6,1,06:05 AM,07:18 PM
3,D0004,C001,17.0,Clear and Sunny,6.1,150,1018.0,0.0,88,0,17.0,10.0,1.0,11.2,1,06:06 AM,07:16 PM
4,D0005,C001,18.0,,3.6,92,1019.0,0.0,94,0,18.0,10.0,1.0,9.0,1,06:07 AM,07:15 PM


In [132]:
def time_to_minutes(time_str):
    time_parts = pd.to_datetime(time_str, format='%I:%M %p')
    return time_parts.hour * 60 + time_parts.minute

# Apply the conversion to 'sunrise' and 'sunset' columns
daily_data['sunrise_minutes'] = daily_data['sunrise'].apply(time_to_minutes)
daily_data['sunset_minutes'] = daily_data['sunset'].apply(time_to_minutes)

# Drop the original 'sunrise' and 'sunset' columns
daily_data = daily_data.drop(['sunrise', 'sunset'], axis=1)

In [133]:
# Assuming `data` is your DataFrame
filtered_data = daily_data.dropna(subset=['condition_text'])

filtered_data.head()

Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise_minutes,sunset_minutes
2,D0003,C001,20.0,Light Rain with Thunder,3.6,10,1011.0,4.5,100,75,20.0,10.0,1.0,12.6,1,365,1158
3,D0004,C001,17.0,Clear and Sunny,6.1,150,1018.0,0.0,88,0,17.0,10.0,1.0,11.2,1,366,1156
6,D0007,C001,21.0,Partly Cloudy,4.0,310,1015.0,0.0,100,50,21.0,10.0,1.0,15.1,2,368,1151
18,D0019,C001,19.0,Clear and Sunny,3.6,64,1017.0,0.0,88,0,19.0,10.0,1.0,8.3,3,380,1131
27,D0028,C002,19.0,Partly Cloudy,3.6,83,1010.0,0.0,73,25,19.0,10.0,1.0,8.3,1,377,1160


In [134]:
# Filter out rows where 'condition_text' is not null for training
train_data = daily_data[daily_data['condition_text'].notnull()]

# Separate rows where 'condition_text' is null for prediction
test_data = daily_data[daily_data['condition_text'].isnull()]
print(test_data)

     day_id city_id  temperature_celsius condition_text  wind_kph  \
0     D0001    C001                 27.0            NaN       6.1   
1     D0002    C001                 22.0            NaN       6.1   
4     D0005    C001                 18.0            NaN       3.6   
5     D0006    C001                 20.0            NaN       3.6   
7     D0008    C001                 21.0            NaN      20.2   
...     ...     ...                  ...            ...       ...   
2887  D2888    C112                 19.5            NaN      16.6   
2889  D2890    C112                 17.4            NaN      13.0   
2890  D2891    C112                 19.2            NaN      11.5   
2891  D2892    C112                 19.2            NaN      14.4   
2892  D2893    C112                 18.6            NaN      17.3   

      wind_degree  pressure_mb  precip_mm  humidity  cloud  \
0             210       1006.0        0.0        54     75   
1             170       1006.0        0.0      

In [135]:
features = [col for col in daily_data.columns if col not in ['day_id','city_id', 'condition_text']]
print(features)

['temperature_celsius', 'wind_kph', 'wind_degree', 'pressure_mb', 'precip_mm', 'humidity', 'cloud', 'feels_like_celsius', 'visibility_km', 'uv_index', 'gust_kph', 'air_quality_us-epa-index', 'sunrise_minutes', 'sunset_minutes']


In [136]:
X_train = train_data[features]
y_train = train_data['condition_text']

X_test = test_data[features]
print(X_test)

      temperature_celsius  wind_kph  wind_degree  pressure_mb  precip_mm  \
0                    27.0       6.1          210       1006.0        0.0   
1                    22.0       6.1          170       1006.0        0.0   
4                    18.0       3.6           92       1019.0        0.0   
5                    20.0       3.6           96       1019.0        0.0   
7                    21.0      20.2          330       1011.0        0.0   
...                   ...       ...          ...          ...        ...   
2887                 19.5      16.6          113       1015.0        0.0   
2889                 17.4      13.0           51       1017.0        0.0   
2890                 19.2      11.5           46       1016.0        0.0   
2891                 19.2      14.4           76       1017.0        0.0   
2892                 18.6      17.3           61       1016.0        0.0   

      humidity  cloud  feels_like_celsius  visibility_km  uv_index  gust_kph  \
0      

In [148]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Split the training data for evaluation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [149]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [150]:
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train_split, y_train_split)

# Get the best model from the grid search
best_model = grid_search.best_estimator_

In [165]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier 

clf1 = RandomForestClassifier(n_estimators=100, random_state=42)
clf2 = AdaBoostClassifier(n_estimators=100, random_state=42)
clf3 = GradientBoostingClassifier(n_estimators=100, random_state=42)

ensemble_clf = VotingClassifier(estimators=[('rf', clf1), ('adaboost', clf2), ('gb', clf3)], voting='soft')
ensemble_clf.fit(X_train, y_train)

y_val_pred = ensemble_clf.predict(X_val_split)




In [166]:
# Evaluate the best model on validation set
# y_val_pred = best_model.predict(X_val_split)
print(classification_report(y_val_split, y_val_pred))
print("Validation Accuracy:", accuracy_score(y_val_split, y_val_pred))

                         precision    recall  f1-score   support

        Clear and Sunny       1.00      1.00      1.00        22
    Cloudy and Overcast       1.00      1.00      1.00        12
    Light Precipitation       1.00      1.00      1.00        12
Light Rain with Thunder       1.00      1.00      1.00         5
            Mist or Fog       1.00      1.00      1.00         8
 Moderate to Heavy Rain       1.00      1.00      1.00         4
          Partly Cloudy       1.00      1.00      1.00        26
           Rain Showers       1.00      1.00      1.00         5
          Thunderstorms       1.00      1.00      1.00         2

               accuracy                           1.00        96
              macro avg       1.00      1.00      1.00        96
           weighted avg       1.00      1.00      1.00        96

Validation Accuracy: 1.0


In [164]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(ensemble_clf, X_train, y_train, cv=5)
print("Cross-validated Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))




Cross-validated Accuracy: 0.68 (+/- 0.04)


In [None]:
# # Initialize and train the model
# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_train_split, y_train_split)

# # Evaluate the model
# y_val_pred = model.predict(X_val_split)
# print(classification_report(y_val_split, y_val_pred))
# print("Validation Accuracy:", accuracy_score(y_val_split, y_val_pred))

# print(X_test)

In [168]:
test_data['condition_text'] = ensemble_clf.predict(X_test)

test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['condition_text'] = ensemble_clf.predict(X_test)


Unnamed: 0,day_id,city_id,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise_minutes,sunset_minutes
0,D0001,C001,27.0,Partly Cloudy,6.1,210,1006.0,0.0,54,75,28.0,10.0,6.0,11.9,2,364,1159
1,D0002,C001,22.0,Partly Cloudy,6.1,170,1006.0,0.0,73,75,24.5,10.0,1.0,23.4,1,365,1158
4,D0005,C001,18.0,Clear and Sunny,3.6,92,1019.0,0.0,94,0,18.0,10.0,1.0,9.0,1,367,1155
5,D0006,C001,20.0,Clear and Sunny,3.6,96,1019.0,0.0,88,0,20.0,10.0,1.0,11.2,1,367,1153
7,D0008,C001,21.0,Partly Cloudy,20.2,330,1011.0,0.0,53,75,21.0,10.0,1.0,17.3,1,369,1150


In [169]:
combined_df = pd.concat([train_data, test_data], axis=0)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2893 entries, 2 to 2892
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   day_id                    2893 non-null   object 
 1   city_id                   2893 non-null   object 
 2   temperature_celsius       2893 non-null   float64
 3   condition_text            2893 non-null   object 
 4   wind_kph                  2893 non-null   float64
 5   wind_degree               2893 non-null   int64  
 6   pressure_mb               2893 non-null   float64
 7   precip_mm                 2893 non-null   float64
 8   humidity                  2893 non-null   int64  
 9   cloud                     2893 non-null   int64  
 10  feels_like_celsius        2893 non-null   float64
 11  visibility_km             2893 non-null   float64
 12  uv_index                  2893 non-null   float64
 13  gust_kph                  2893 non-null   float64
 14  air_qual

In [170]:
combined_df = combined_df.sort_values(by='day_id')
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2893 entries, 0 to 2892
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   day_id                    2893 non-null   object 
 1   city_id                   2893 non-null   object 
 2   temperature_celsius       2893 non-null   float64
 3   condition_text            2893 non-null   object 
 4   wind_kph                  2893 non-null   float64
 5   wind_degree               2893 non-null   int64  
 6   pressure_mb               2893 non-null   float64
 7   precip_mm                 2893 non-null   float64
 8   humidity                  2893 non-null   int64  
 9   cloud                     2893 non-null   int64  
 10  feels_like_celsius        2893 non-null   float64
 11  visibility_km             2893 non-null   float64
 12  uv_index                  2893 non-null   float64
 13  gust_kph                  2893 non-null   float64
 14  air_qual

In [171]:
submission = combined_df[['day_id', 'condition_text']]

# Save to CSV
submission.to_csv('submission4.csv', index=False)