In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(r'..\..\Weather Data\weather_hourly_active.csv')

In [3]:
df.columns

Index(['DATE', 'HOUR', 'STATION', 'MGRS', 'LAT', 'LONG', 'ELEV', 'WIND_ANGLE',
       'WIND_SPEED', 'SKY_OBS', 'VISIBILITY_DIST', 'AIR_TEMP', 'AIR_TEMP_DEW',
       'ATM_PRESSURE', 'YEAR', 'MONTH', 'DAY', 'ACTIVE'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,DATE,HOUR,STATION,MGRS,LAT,LONG,ELEV,WIND_ANGLE,WIND_SPEED,SKY_OBS,VISIBILITY_DIST,AIR_TEMP,AIR_TEMP_DEW,ATM_PRESSURE,YEAR,MONTH,DAY,ACTIVE
0,2013-01-01,0,690150-93121,11SNT,34.3,-116.166,696.0,340.0,3.6,22000.0,16093.0,7.2,-11.7,1023.1,2013.0,1.0,1.0,False
1,2013-01-01,0,720165-99999,11SNT,34.264,-116.854,2057.0,105.0,1.366667,22000.0,16093.0,0.0,-16.0,,2013.0,1.0,1.0,False
2,2013-01-01,0,720267-23224,10SFJ,38.955,-121.081,467.75,270.0,0.775,22000.0,16093.0,6.5,2.0,,2013.0,1.0,1.0,False
3,2013-01-01,0,720406-99999,10SEH,38.15,-122.55,1.0,130.0,0.5,22000.0,16093.0,10.0,-1.0,,2013.0,1.0,1.0,False
4,2013-01-01,0,720935-99999,10SEH,38.25,-122.6,27.0,275.0,1.733333,7953.333333,16093.0,10.0,2.666667,,2013.0,1.0,1.0,False


In [5]:
# need to select either random hour in the day or average of several hours
# time is in UTC, need to convert to PST

In [6]:
df.HOUR.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23], dtype=int64)

In [7]:
# convert to HH:MM:SS rather than integer hour
df.HOUR.astype('timedelta64[h]')

0         0 days 00:00:00
1         0 days 00:00:00
2         0 days 00:00:00
3         0 days 00:00:00
4         0 days 00:00:00
                ...      
9881565   0 days 23:00:00
9881566   0 days 23:00:00
9881567   0 days 23:00:00
9881568   0 days 23:00:00
9881569   0 days 23:00:00
Name: HOUR, Length: 9881570, dtype: timedelta64[ns]

In [8]:
# combine date and hour
df['DT'] = pd.to_datetime(df.DATE) + df.HOUR.astype('timedelta64[h]')

In [9]:
df['DT'] = pd.to_datetime(df['DT'], utc=True)

In [10]:
# convert to PST, this is where our weather data was collected
df['DTP'] = df['DT'].dt.tz_convert('US/Pacific')

In [11]:
df.head()

Unnamed: 0,DATE,HOUR,STATION,MGRS,LAT,LONG,ELEV,WIND_ANGLE,WIND_SPEED,SKY_OBS,VISIBILITY_DIST,AIR_TEMP,AIR_TEMP_DEW,ATM_PRESSURE,YEAR,MONTH,DAY,ACTIVE,DT,DTP
0,2013-01-01,0,690150-93121,11SNT,34.3,-116.166,696.0,340.0,3.6,22000.0,16093.0,7.2,-11.7,1023.1,2013.0,1.0,1.0,False,2013-01-01 00:00:00+00:00,2012-12-31 16:00:00-08:00
1,2013-01-01,0,720165-99999,11SNT,34.264,-116.854,2057.0,105.0,1.366667,22000.0,16093.0,0.0,-16.0,,2013.0,1.0,1.0,False,2013-01-01 00:00:00+00:00,2012-12-31 16:00:00-08:00
2,2013-01-01,0,720267-23224,10SFJ,38.955,-121.081,467.75,270.0,0.775,22000.0,16093.0,6.5,2.0,,2013.0,1.0,1.0,False,2013-01-01 00:00:00+00:00,2012-12-31 16:00:00-08:00
3,2013-01-01,0,720406-99999,10SEH,38.15,-122.55,1.0,130.0,0.5,22000.0,16093.0,10.0,-1.0,,2013.0,1.0,1.0,False,2013-01-01 00:00:00+00:00,2012-12-31 16:00:00-08:00
4,2013-01-01,0,720935-99999,10SEH,38.25,-122.6,27.0,275.0,1.733333,7953.333333,16093.0,10.0,2.666667,,2013.0,1.0,1.0,False,2013-01-01 00:00:00+00:00,2012-12-31 16:00:00-08:00


In [12]:
# choose 1:00 PM for simplicity, created a mid day dataframe
dfmid = df[df['DTP'].dt.hour == 13]

In [13]:
dfmid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 416538 entries, 2669 to 9881335
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype                     
---  ------           --------------   -----                     
 0   DATE             416538 non-null  object                    
 1   HOUR             416538 non-null  int64                     
 2   STATION          416538 non-null  object                    
 3   MGRS             416538 non-null  object                    
 4   LAT              416538 non-null  float64                   
 5   LONG             416538 non-null  float64                   
 6   ELEV             416015 non-null  float64                   
 7   WIND_ANGLE       330386 non-null  float64                   
 8   WIND_SPEED       411207 non-null  float64                   
 9   SKY_OBS          310135 non-null  float64                   
 10  VISIBILITY_DIST  312774 non-null  float64                   
 11  AIR_TEMP         39628

In [14]:
# filling 0 becasue much of the wind data is nan where there was no wind
dfmid.fillna(0, inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfmid.fillna(0, inplace= True)


In [15]:
dfmid['ACTIVE'].value_counts()

False    410794
True       5744
Name: ACTIVE, dtype: int64

In [16]:
# dates with active fires
dfmid_act = dfmid[dfmid['ACTIVE'] ==1]

In [17]:
dfmid_act.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5744 entries, 168815 to 9638695
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype                     
---  ------           --------------  -----                     
 0   DATE             5744 non-null   object                    
 1   HOUR             5744 non-null   int64                     
 2   STATION          5744 non-null   object                    
 3   MGRS             5744 non-null   object                    
 4   LAT              5744 non-null   float64                   
 5   LONG             5744 non-null   float64                   
 6   ELEV             5744 non-null   float64                   
 7   WIND_ANGLE       5744 non-null   float64                   
 8   WIND_SPEED       5744 non-null   float64                   
 9   SKY_OBS          5744 non-null   float64                   
 10  VISIBILITY_DIST  5744 non-null   float64                   
 11  AIR_TEMP         5744 non-null   fl

In [18]:
# non active, choose 7,000 to balance dataset
dfmid_non = dfmid[dfmid['ACTIVE'] == 0].sample(n=7000)

In [19]:
dfmid_non.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 9085293 to 9516641
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype                     
---  ------           --------------  -----                     
 0   DATE             7000 non-null   object                    
 1   HOUR             7000 non-null   int64                     
 2   STATION          7000 non-null   object                    
 3   MGRS             7000 non-null   object                    
 4   LAT              7000 non-null   float64                   
 5   LONG             7000 non-null   float64                   
 6   ELEV             7000 non-null   float64                   
 7   WIND_ANGLE       7000 non-null   float64                   
 8   WIND_SPEED       7000 non-null   float64                   
 9   SKY_OBS          7000 non-null   float64                   
 10  VISIBILITY_DIST  7000 non-null   float64                   
 11  AIR_TEMP         7000 non-null   f

In [20]:
# join the active and non active datasets, to create a dataset for logistic regression (LR)
df_LR = pd.concat([dfmid_act, dfmid_non])

In [21]:
df_LR.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12744 entries, 168815 to 9516641
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype                     
---  ------           --------------  -----                     
 0   DATE             12744 non-null  object                    
 1   HOUR             12744 non-null  int64                     
 2   STATION          12744 non-null  object                    
 3   MGRS             12744 non-null  object                    
 4   LAT              12744 non-null  float64                   
 5   LONG             12744 non-null  float64                   
 6   ELEV             12744 non-null  float64                   
 7   WIND_ANGLE       12744 non-null  float64                   
 8   WIND_SPEED       12744 non-null  float64                   
 9   SKY_OBS          12744 non-null  float64                   
 10  VISIBILITY_DIST  12744 non-null  float64                   
 11  AIR_TEMP         12744 non-null  f

In [22]:
df_LR['ACTIVE'].value_counts()

False    7000
True     5744
Name: ACTIVE, dtype: int64

In [23]:
# https://towardsdatascience.com/logistic-regression-using-python-sklearn-numpy-mnist-handwriting-recognition-matplotlib-a6b31e2b166a
# good reference material

In [24]:
df_LR.head()

Unnamed: 0,DATE,HOUR,STATION,MGRS,LAT,LONG,ELEV,WIND_ANGLE,WIND_SPEED,SKY_OBS,VISIBILITY_DIST,AIR_TEMP,AIR_TEMP_DEW,ATM_PRESSURE,YEAR,MONTH,DAY,ACTIVE,DT,DTP
168815,2013-02-24,21,999999-53139,11SMA,36.602,-117.145,26.0,0.0,5.8,0.0,0.0,0.0,0.0,0.0,2013.0,2.0,24.0,True,2013-02-24 21:00:00+00:00,2013-02-24 13:00:00-08:00
338522,2013-04-20,20,690150-93121,11SNT,34.3,-116.166,696.0,0.0,2.6,22000.0,16093.0,28.3,-6.7,1014.4,2013.0,4.0,20.0,True,2013-04-20 20:00:00+00:00,2013-04-20 13:00:00-07:00
338523,2013-04-20,20,720165-99999,11SNT,34.264,-116.854,2057.0,50.0,3.766667,22000.0,16093.0,16.333333,-11.0,0.0,2013.0,4.0,20.0,True,2013-04-20 20:00:00+00:00,2013-04-20 13:00:00-07:00
338533,2013-04-20,20,722868-93138,11SNT,33.822,-116.504,137.0,100.0,4.1,22000.0,16093.0,32.8,-7.2,1011.5,2013.0,4.0,20.0,True,2013-04-20 20:00:00+00:00,2013-04-20 13:00:00-07:00
369279,2013-04-30,20,720406-99999,10SEH,38.15,-122.55,1.0,313.333333,5.133333,22000.0,11667.333333,26.666667,4.333333,0.0,2013.0,4.0,30.0,True,2013-04-30 20:00:00+00:00,2013-04-30 13:00:00-07:00


In [25]:
df_LR = df_LR[['AIR_TEMP', 'WIND_SPEED', 'MGRS', 'ACTIVE']]

In [26]:
dum = pd.get_dummies(df_LR['MGRS'])

In [27]:
df_LR = df_LR.join(dum)

In [28]:
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

In [29]:
# encode MGRS location
# le = LabelEncoder()
# df_LR['MGRS_D'] = le.fit_transform(df_LR['MGRS'])

In [30]:
# mix up the data
df_LR = shuffle(df_LR)

In [31]:
df_LR.shape

(12744, 52)

In [32]:
df_LR_train = df_LR[:11444]

In [33]:
df_LR_test = df_LR[11444:] # 10%

In [34]:
df_LR_train['ACTIVE'].value_counts()

False    6286
True     5158
Name: ACTIVE, dtype: int64

In [35]:
df_LR_test['ACTIVE'].value_counts() # nice mix

False    714
True     586
Name: ACTIVE, dtype: int64

In [36]:
X = df_LR_train.drop(['ACTIVE', 'MGRS'], axis=1)
y = df_LR_train['ACTIVE']

In [37]:
LR = LogisticRegression().fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [38]:
# may need to try a diff optimizer
# https://stackoverflow.com/questions/62658215/convergencewarning-lbfgs-failed-to-converge-status-1-stop-total-no-of-iter

In [39]:
LR = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X, y)

In [40]:
x_test = df_LR_test.drop(['ACTIVE', 'MGRS'], axis=1)
y_test = df_LR_test['ACTIVE']

In [41]:
x_test.head()

Unnamed: 0,AIR_TEMP,WIND_SPEED,10SDH,10SDJ,10SEF,10SEG,10SEH,10SEK,10SFF,10SFG,...,11SMT,11SMU,11SMV,11SNR,11SNS,11SNT,11SNU,11SPS,11SQT,11SQU
3916679,18.35,5.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181155,20.0,3.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
813252,29.4,4.1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8119832,36.1,1.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1631752,25.6,6.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
LR.predict(x_test)

array([False, False,  True, ..., False,  True, False])

In [43]:
y_test.values

array([False, False,  True, ..., False,  True, False])

In [44]:
score = LR.score(x_test, y_test)
print(score) 
# %72 is ok, though the test data should be more similar to real world rather than 50/50 mix

0.7215384615384616


In [45]:
# may want to increase what an active is defined as verse non active
LR.predict_proba(x_test)

array([[0.55891833, 0.44108167],
       [0.50336226, 0.49663774],
       [0.23632689, 0.76367311],
       ...,
       [0.71675979, 0.28324021],
       [0.29039203, 0.70960797],
       [0.59227326, 0.40772674]])

In [46]:
# try some other solvers
# https://stackoverflow.com/questions/38640109/logistic-regression-python-solvers-definitions/52388406#52388406

In [47]:
LR = LogisticRegression(solver='saga', max_iter=1000).fit(X, y)

In [48]:
score = LR.score(x_test, y_test)
print(score)
# very similar

0.7207692307692307
