In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(r'..\..\Weather Data\weather_hourly_active.csv')

In [3]:
df.columns

Index(['DATE', 'HOUR', 'STATION', 'MGRS', 'LAT', 'LONG', 'ELEV', 'WIND_ANGLE',
       'WIND_SPEED', 'SKY_OBS', 'VISIBILITY_DIST', 'AIR_TEMP', 'AIR_TEMP_DEW',
       'ATM_PRESSURE', 'YEAR', 'MONTH', 'DAY', 'ACTIVE'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,DATE,HOUR,STATION,MGRS,LAT,LONG,ELEV,WIND_ANGLE,WIND_SPEED,SKY_OBS,VISIBILITY_DIST,AIR_TEMP,AIR_TEMP_DEW,ATM_PRESSURE,YEAR,MONTH,DAY,ACTIVE
0,2013-01-01,0,690150-93121,11SNT,34.3,-116.166,696.0,340.0,3.6,22000.0,16093.0,7.2,-11.7,1023.1,2013.0,1.0,1.0,False
1,2013-01-01,0,720165-99999,11SNT,34.264,-116.854,2057.0,105.0,1.366667,22000.0,16093.0,0.0,-16.0,,2013.0,1.0,1.0,False
2,2013-01-01,0,720267-23224,10SFJ,38.955,-121.081,467.75,270.0,0.775,22000.0,16093.0,6.5,2.0,,2013.0,1.0,1.0,False
3,2013-01-01,0,720406-99999,10SEH,38.15,-122.55,1.0,130.0,0.5,22000.0,16093.0,10.0,-1.0,,2013.0,1.0,1.0,False
4,2013-01-01,0,720935-99999,10SEH,38.25,-122.6,27.0,275.0,1.733333,7953.333333,16093.0,10.0,2.666667,,2013.0,1.0,1.0,False


In [5]:
# need to select either random hour in the day or average of several hours
# time is in UTC, need to convert to PST

In [6]:
df.HOUR.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23], dtype=int64)

In [7]:
# convert to HH:MM:SS rather than integer hour
df.HOUR.astype('timedelta64[h]')

0         0 days 00:00:00
1         0 days 00:00:00
2         0 days 00:00:00
3         0 days 00:00:00
4         0 days 00:00:00
                ...      
9881565   0 days 23:00:00
9881566   0 days 23:00:00
9881567   0 days 23:00:00
9881568   0 days 23:00:00
9881569   0 days 23:00:00
Name: HOUR, Length: 9881570, dtype: timedelta64[ns]

In [8]:
# combine date and hour
df['DT'] = pd.to_datetime(df.DATE) + df.HOUR.astype('timedelta64[h]')

In [9]:
df['DT'] = pd.to_datetime(df['DT'], utc=True)

In [10]:
# convert to PST, this is where our weather data was collected
df['DTP'] = df['DT'].dt.tz_convert('US/Pacific')

In [11]:
df.head()

Unnamed: 0,DATE,HOUR,STATION,MGRS,LAT,LONG,ELEV,WIND_ANGLE,WIND_SPEED,SKY_OBS,VISIBILITY_DIST,AIR_TEMP,AIR_TEMP_DEW,ATM_PRESSURE,YEAR,MONTH,DAY,ACTIVE,DT,DTP
0,2013-01-01,0,690150-93121,11SNT,34.3,-116.166,696.0,340.0,3.6,22000.0,16093.0,7.2,-11.7,1023.1,2013.0,1.0,1.0,False,2013-01-01 00:00:00+00:00,2012-12-31 16:00:00-08:00
1,2013-01-01,0,720165-99999,11SNT,34.264,-116.854,2057.0,105.0,1.366667,22000.0,16093.0,0.0,-16.0,,2013.0,1.0,1.0,False,2013-01-01 00:00:00+00:00,2012-12-31 16:00:00-08:00
2,2013-01-01,0,720267-23224,10SFJ,38.955,-121.081,467.75,270.0,0.775,22000.0,16093.0,6.5,2.0,,2013.0,1.0,1.0,False,2013-01-01 00:00:00+00:00,2012-12-31 16:00:00-08:00
3,2013-01-01,0,720406-99999,10SEH,38.15,-122.55,1.0,130.0,0.5,22000.0,16093.0,10.0,-1.0,,2013.0,1.0,1.0,False,2013-01-01 00:00:00+00:00,2012-12-31 16:00:00-08:00
4,2013-01-01,0,720935-99999,10SEH,38.25,-122.6,27.0,275.0,1.733333,7953.333333,16093.0,10.0,2.666667,,2013.0,1.0,1.0,False,2013-01-01 00:00:00+00:00,2012-12-31 16:00:00-08:00


In [12]:
# choose 1:00 PM for simplicity, created a mid day dataframe
dfmid = df[df['DTP'].dt.hour == 13]

In [13]:
dfmid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 416538 entries, 2669 to 9881335
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype                     
---  ------           --------------   -----                     
 0   DATE             416538 non-null  object                    
 1   HOUR             416538 non-null  int64                     
 2   STATION          416538 non-null  object                    
 3   MGRS             416538 non-null  object                    
 4   LAT              416538 non-null  float64                   
 5   LONG             416538 non-null  float64                   
 6   ELEV             416015 non-null  float64                   
 7   WIND_ANGLE       330386 non-null  float64                   
 8   WIND_SPEED       411207 non-null  float64                   
 9   SKY_OBS          310135 non-null  float64                   
 10  VISIBILITY_DIST  312774 non-null  float64                   
 11  AIR_TEMP         39628

In [14]:
# filling 0 becasue much of the wind data is nan where there was no wind
dfmid.fillna(0, inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmid.fillna(0, inplace= True)


In [15]:
dfmid['ACTIVE'].value_counts()

False    410794
True       5744
Name: ACTIVE, dtype: int64

In [16]:
# dates with active fires
dfmid_act = dfmid[dfmid['ACTIVE'] ==1]

In [17]:
dfmid_act.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5744 entries, 168815 to 9638695
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype                     
---  ------           --------------  -----                     
 0   DATE             5744 non-null   object                    
 1   HOUR             5744 non-null   int64                     
 2   STATION          5744 non-null   object                    
 3   MGRS             5744 non-null   object                    
 4   LAT              5744 non-null   float64                   
 5   LONG             5744 non-null   float64                   
 6   ELEV             5744 non-null   float64                   
 7   WIND_ANGLE       5744 non-null   float64                   
 8   WIND_SPEED       5744 non-null   float64                   
 9   SKY_OBS          5744 non-null   float64                   
 10  VISIBILITY_DIST  5744 non-null   float64                   
 11  AIR_TEMP         5744 non-null   fl

In [18]:
# non active, choose 7,000 to balance dataset
dfmid_non = dfmid[dfmid['ACTIVE'] == 0].sample(n=7000)

In [19]:
dfmid_non.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 7717461 to 5978402
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype                     
---  ------           --------------  -----                     
 0   DATE             7000 non-null   object                    
 1   HOUR             7000 non-null   int64                     
 2   STATION          7000 non-null   object                    
 3   MGRS             7000 non-null   object                    
 4   LAT              7000 non-null   float64                   
 5   LONG             7000 non-null   float64                   
 6   ELEV             7000 non-null   float64                   
 7   WIND_ANGLE       7000 non-null   float64                   
 8   WIND_SPEED       7000 non-null   float64                   
 9   SKY_OBS          7000 non-null   float64                   
 10  VISIBILITY_DIST  7000 non-null   float64                   
 11  AIR_TEMP         7000 non-null   f

In [20]:
# join the active and non active datasets, to create a dataset for logistic regression (LR)
df_LR = pd.concat([dfmid_act, dfmid_non])

In [21]:
df_LR.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12744 entries, 168815 to 5978402
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype                     
---  ------           --------------  -----                     
 0   DATE             12744 non-null  object                    
 1   HOUR             12744 non-null  int64                     
 2   STATION          12744 non-null  object                    
 3   MGRS             12744 non-null  object                    
 4   LAT              12744 non-null  float64                   
 5   LONG             12744 non-null  float64                   
 6   ELEV             12744 non-null  float64                   
 7   WIND_ANGLE       12744 non-null  float64                   
 8   WIND_SPEED       12744 non-null  float64                   
 9   SKY_OBS          12744 non-null  float64                   
 10  VISIBILITY_DIST  12744 non-null  float64                   
 11  AIR_TEMP         12744 non-null  f

In [22]:
df_LR['ACTIVE'].value_counts()

False    7000
True     5744
Name: ACTIVE, dtype: int64

In [23]:
# https://towardsdatascience.com/logistic-regression-using-python-sklearn-numpy-mnist-handwriting-recognition-matplotlib-a6b31e2b166a
# good reference material

In [24]:
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

In [25]:
# encode MGRS location
le = LabelEncoder()
df_LR['MGRS_D'] = le.fit_transform(df_LR['MGRS'])

In [26]:
# mix up the data
df_LR = shuffle(df_LR)

In [27]:
df_LR.shape

(12744, 21)

In [28]:
df_LR_train = df_LR[:11444]

In [29]:
df_LR_test = df_LR[11444:] # 10%

In [30]:
df_LR_train['ACTIVE'].value_counts()

False    6262
True     5182
Name: ACTIVE, dtype: int64

In [31]:
df_LR_test['ACTIVE'].value_counts() # nice mix

False    738
True     562
Name: ACTIVE, dtype: int64

In [32]:
X = df_LR_train[['AIR_TEMP', 'WIND_SPEED', 'MGRS_D']]
y = df_LR_train['ACTIVE']

In [33]:
LR = LogisticRegression().fit(X, y)

In [34]:
x_test = df_LR_test[['AIR_TEMP', 'WIND_SPEED', 'MGRS_D']]
y_test = df_LR_test['ACTIVE']

In [35]:
x_test

Unnamed: 0,AIR_TEMP,WIND_SPEED,MGRS_D
7326573,21.700000,6.700000,29
8517963,30.333333,1.000000,8
5757302,15.900000,8.800000,4
7169082,23.300000,4.600000,6
4352831,17.900000,4.100000,4
...,...,...,...
3899956,30.000000,2.100000,22
9154321,26.666667,2.766667,9
4098038,21.150000,5.400000,37
3192714,17.058333,1.600000,7


In [36]:
LR.predict(x_test)

array([False,  True, False, ..., False, False, False])

In [37]:
y_test.values

array([False,  True, False, ..., False, False, False])

In [38]:
score = LR.score(x_test, y_test)
print(score) 
# %66 is ok, though the test data should be more similar to real world rather than 50/50 mix

0.6692307692307692


In [39]:
LR.predict_proba(x_test)

array([[0.58839507, 0.41160493],
       [0.39117721, 0.60882279],
       [0.57599981, 0.42400019],
       ...,
       [0.66363231, 0.33636769],
       [0.70501093, 0.29498907],
       [0.77991469, 0.22008531]])