## Web app

## Import Libraries

In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from helper_functions import TrainingSet, plot_correlation, plot_scatter
import pickle

## Preprocess Data

### Load Dataframe

In [14]:
# Location of the raw UFO sightings CSV, relative to the notebook's working directory.
ufo_csv_path = './data/ufos.csv'

# Read the raw sightings into the untouched "original" frame; later cells derive
# their working frame from it.
df_org = pd.read_csv(ufo_csv_path)

# Bare last expression -> rich (truncated) preview of the frame.
df_org

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700.0,45 minutes,This event took place in early fall around 194...,4/27/2004,29.883056,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200.0,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.384210,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20.0,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.200000,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20.0,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.978333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900.0,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.418056,-157.803611
...,...,...,...,...,...,...,...,...,...,...,...
80327,9/9/2013 21:15,nashville,tn,us,light,600.0,10 minutes,Round from the distance/slowly changing colors...,9/30/2013,36.165833,-86.784444
80328,9/9/2013 22:00,boise,id,us,circle,1200.0,20 minutes,Boise&#44 ID&#44 spherical&#44 20 min&#44 10 r...,9/30/2013,43.613611,-116.202500
80329,9/9/2013 22:00,napa,ca,us,other,1200.0,hour,Napa UFO&#44,9/30/2013,38.297222,-122.284444
80330,9/9/2013 22:20,vienna,va,us,circle,5.0,5 seconds,Saw a five gold lit cicular craft moving fastl...,9/30/2013,38.901111,-77.265556


### Clean up Dataframe

In [15]:
# Keep only the columns used for modelling, under short title-case labels.
# Keys are the new column names; values are the matching columns of the raw frame.
column_map = {
    'Seconds': 'duration (seconds)',
    'Country': 'country',
    'Latitude': 'latitude',
    'Longitude': 'longitude',
}
df = pd.DataFrame({new_name: df_org[raw_name] for new_name, raw_name in column_map.items()})

# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would echo a stray "None" after the summary.
df.info()
print()

# Distinct country codes, including NaN for rows with no country recorded.
print(df.Country.unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80332 entries, 0 to 80331
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Seconds    80332 non-null  float64
 1   Country    70662 non-null  object 
 2   Latitude   80332 non-null  float64
 3   Longitude  80332 non-null  float64
dtypes: float64(3), object(1)
memory usage: 2.5+ MB
None

['us' nan 'gb' 'ca' 'au' 'de']


In [16]:
# Drop every row that has a missing value in any of the selected columns.
# Reassigning (instead of inplace=True) keeps the step explicit and chainable.
df = df.dropna()

# Keep sightings that lasted between 1 and 60 seconds (both bounds inclusive).
# .copy() makes the result an independent frame, so assigning to its columns in
# later cells does not operate on a slice of the unfiltered data
# (avoids pandas' SettingWithCopyWarning).
df = df[df.Seconds.between(1, 60)].copy()

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25863 entries, 2 to 80330
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Seconds    25863 non-null  float64
 1   Country    25863 non-null  object 
 2   Latitude   25863 non-null  float64
 3   Longitude  25863 non-null  float64
dtypes: float64(3), object(1)
memory usage: 1010.3+ KB


In [18]:
# Keep the fitted encoder: its classes_ attribute is the only record of which
# integer code stands for which country, and inverse_transform() is needed to turn
# model predictions back into country codes.
country_encoder = LabelEncoder()

# Replace the country strings with their integer codes. assign() returns a new frame,
# so this does not write into a slice of an earlier frame.
df = df.assign(Country=country_encoder.fit_transform(df['Country']))

df.head()

Unnamed: 0,Seconds,Country,Latitude,Longitude
2,20.0,3,53.2,-2.916667
3,20.0,4,28.978333,-96.645833
14,30.0,4,35.823889,-80.253611
23,60.0,4,45.582778,-122.352222
24,3.0,3,51.783333,-0.783333


## Visualize Data

In [19]:
# Inspect how the selected columns relate to one another using the project helper
# imported from helper_functions (its implementation is not part of this notebook;
# the recorded output below is a pairwise correlation table).
plot_correlation(df)

Unnamed: 0,Seconds,Country,Latitude,Longitude
Seconds,1.0,4.1e-05,-0.002453,0.040644
Country,4.1e-05,1.0,0.080515,-0.428201
Latitude,-0.002453,0.080515,1.0,-0.371981
Longitude,0.040644,-0.428201,-0.371981,1.0


## Model

In [20]:
# Set up the project's TrainingSet helper with Seconds/Latitude/Longitude as the
# feature columns and Country as the target. test_size and random_state are handed
# to the helper (presumably forwarded to train_test_split -- TODO confirm).
ts = TrainingSet(df, ['Seconds', 'Latitude', 'Longitude'], 'Country', test_size=0.2, random_state=0)

# Fit the classifier through the helper.
# NOTE(review): the recorded output contains scikit-learn's "TOTAL NO. of ITERATIONS
# REACHED LIMIT" message, i.e. the solver stopped at its iteration cap. Consider
# scaling the features or raising max_iter inside the helper.
ts.Build_LogisticRegression()

# Print the per-class report. In the recorded run class 1 reaches only 0.23 recall,
# so the 0.96 overall accuracy hides weak performance on that class.
ts.Print_ClassificationReport()

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       0.83      0.23      0.36       250
           2       1.00      1.00      1.00         8
           3       1.00      1.00      1.00       131
           4       0.96      1.00      0.98      4743

    accuracy                           0.96      5173
   macro avg       0.96      0.85      0.87      5173
weighted avg       0.96      0.96      0.95      5173

Predicted labels:  [4 4 4 ... 3 4 4]
Accuracy  0.9605644693601392


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Pickle

In [21]:
model_filename = 'ufo-model.pkl'

# Persist the fitted model. The context manager guarantees the file is flushed and
# closed before it is read back below (a bare open() leaves that to garbage collection).
with open(model_filename, 'wb') as model_file:
    pickle.dump(ts.model, model_file)

# Round-trip check: reload the model from disk.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you created
# yourself or otherwise trust.
with open(model_filename, 'rb') as model_file:
    model = pickle.load(model_file)

# One sample in the same order as the feature list given to TrainingSet:
# [Seconds, Latitude, Longitude]. The result is an integer country code.
print(model.predict([[50, 44, -12]]))

[1]
