<a href="https://colab.research.google.com/github/wanderloop/WanderlustAI/blob/master/coo_to_midpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from numpy import nan, reshape, int16, float32
from pandas import DataFrame, read_csv, concat, set_option
set_option('display.max_columns', None)
set_option('display.max_rows', None)
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
!pip install dtale
import dtale
import dtale.app as dtale_app
from plotly.figure_factory import create_table
from sklearn.impute import KNNImputer
from sklearn.pipeline import make_pipeline
!pip install tpot
from tpot import TPOTRegressor
from scipy.stats import t, skewtest
!pip install researchpy
from researchpy import summary_cont
from gc import collect
from IPython.display import clear_output
clear_output()

In [None]:
!pip list | grep numpy
!pip list | grep pandas
!pip list | grep dtale
!pip list | grep plotly
!pip list | grep scikit-learn
!pip list | grep scipy
!pip list | grep researchpy
!pip list | grep ipython
!pip list | grep opencv-python

numpy                         1.18.5         
pandas                        1.1.2          
pandas-datareader             0.9.0          
pandas-gbq                    0.13.2         
pandas-profiling              1.4.1          
sklearn-pandas                1.8.0          
dtale                         1.16.0         
plotly                        4.4.1          
scikit-learn                  0.22.2.post1   
scipy                         1.4.1          
researchpy                    0.2.3          
ipython                       5.5.0          
ipython-genutils              0.2.0          
ipython-sql                   0.3.9          
opencv-python                 4.1.2.30       


In [None]:
%%writefile requirements.txt

numpy==1.18.5
pandas==1.1.2
dtale==1.16.0
plotly==4.4.1
scikit-learn==0.22.2.post1
scipy==1.4.1
researchpy==0.2.3
ipython==5.5.0
catboost==0.24.1
opencv-python==4.1.2.30
regex==2019.12.20

Writing requirements.txt


In [2]:
# Column types declared for the CSV. Only the columns named in `usecols`
# below are actually loaded.
int16_columns = ['X1', 'Y1', 'X2', 'Y2', 'Area', 'Long_minus', 'Lat_minus']
float32_columns = ['long', 'lat']
column_dtypes = {**dict.fromkeys(int16_columns, 'int16'),
                 **dict.fromkeys(float32_columns, 'float32')}

df = read_csv(
    'https://raw.githubusercontent.com/wanderloop/WanderlustAI/master/assumed_pha_thousand.csv',
    usecols=['X1', 'X2', 'long', 'lat'],
    dtype=column_dtypes,
    low_memory=True,
)

# Render the first rows as a Plotly table.
data = create_table(df.head())
data.show()

In [3]:
# Creating new feature: MPX, the midpoint of X1 and X2.
# X1 and X2 are loaded as int16, so adding them directly wraps around
# silently once the sum exceeds 32767. Widening to float32 before the addition
# avoids that and already yields the float32 column wanted downstream.
df['MPX'] = (df['X1'].astype('float32') + df['X2'].astype('float32')) / 2
df = df.drop(columns=['X1', 'X2'])

# Render the first rows of the transformed frame as a Plotly table.
data = create_table(df.head())
data.show()

In [None]:
# The preview table has been shown; release the figure object.
del data
collect() # Force a garbage-collection pass; the cell displays its return value

41484

In [None]:
# List the column names that remain in the frame.
print(df.columns.to_numpy())

['long' 'lat' 'MPX']


In [None]:
# Render the last five rows as a Plotly table.
tail = create_table(df.tail(n=5))
tail.show()

In [None]:
# The tail table has been shown; release the figure and collect garbage.
del tail
collect()

19072

In [None]:
# Stack the first and last rows on top of each other and show them as one
# Plotly table.
frames = [df.head(), df.tail()]
conc_data_row = create_table(
    concat(frames,
           axis='index',   # Row-wise operation
           join='outer')   # Keep the columns of both frames
)
conc_data_row.show()

In [None]:
# The combined head/tail table has been shown; release it and collect garbage.
del conc_data_row
collect()

22300

In [None]:
# Both predictors and the target must be fully populated.
assert all(df[name].notna().all() for name in ('long', 'lat', 'MPX'))

In [None]:
# Dropping columns with more than 50% missing values.
# `thresh` is the minimum number of non-missing values a column needs in order
# to be kept. `how` is deliberately not passed: it is ignored whenever `thresh`
# is given, and newer pandas releases reject the combination outright.
min_non_missing = (df.shape[0] + 1) // 2  # Half the rows, rounded up
df = df.dropna(thresh=min_non_missing,
               axis='columns')
df.shape

(18, 3)

In [None]:
# Full per-column summary; 'deep' measures the actual memory footprint.
df.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   long    18 non-null     float32
 1   lat     18 non-null     float32
 2   MPX     18 non-null     float32
dtypes: float32(3)
memory usage: 344.0 bytes


In [None]:
# Tell D-Tale it is running inside Google Colab before launching the viewer.
dtale_app.USE_COLAB = True

# The cell's last expression displays the running D-Tale instance.
report = dtale.show(df, ignore_duplicate=True)
report

https://933ukzxu2u-496ff2e9c6d22116-40000-colab.googleusercontent.com/dtale/main/8

In [None]:
# Calculating the critical t-value: the 0.975 quantile of Student's t with
# n - 1 degrees of freedom, i.e. the two-sided 95% cut-off.
# The sample size is read from the data rather than hard-coded, so the value
# stays correct if the number of rows changes.
sample_size = len(df)
print(t.ppf(1-0.025, sample_size - 1))

2.1098155778331806


In [None]:
# Checking the skewness of the target variable.
# skewtest's null hypothesis is that the sample's skewness matches that of a
# normal distribution. A large p-value therefore only means no skew was
# detected; it does not establish normality on its own, so the messages below
# are worded accordingly.
DataToTest = df['MPX']
stat, p = skewtest(DataToTest)
print(f'stat={stat}',
      f'p={p}')

if p > 0.001:
    print('No significant skew (consistent with a normal distribution)')
else:
    print('Significant skew (not a normal distribution)')

stat=-0.42732514196296806 p=0.669142522790469
Normal distribution


In [None]:
# Descriptive statistics for the predictors and the target, rendered as a
# Plotly table.
summary_columns = ['long', 'lat', 'MPX']
summary = create_table(summary_cont(df[summary_columns]))
summary.show()





In [None]:
# True if any longitude value is zero or negative.
bool((df['long'] <= 0).any())

False

In [None]:
# True if any latitude value is zero or negative.
bool((df['lat'] <= 0).any())

False

In [None]:
# True if any midpoint value is zero or negative.
bool((df['MPX'] <= 0).any())

False

In [4]:
# Predictors (X): the two coordinate columns.
# Target (y): MPX as a single-column 2-D array.
predictor_columns = ['long', 'lat']
X = df[predictor_columns]
y = df['MPX'].to_numpy()[:, None]

In [5]:
# KNN imputer: fills entries marked as nan from the 5 nearest rows, with
# closer rows weighted more heavily. Distances use the nan-aware Euclidean
# metric.
imp = KNNImputer(
    n_neighbors=5,
    weights='distance',
    metric='nan_euclidean',
    missing_values=nan,
)

In [6]:
# TPOT search settings: 5 generations of 100 candidate pipelines, each scored
# by 10-fold cross-validated negative RMSE. The seed is fixed and all CPU
# cores are used.
regr = TPOTRegressor(
    scoring='neg_root_mean_squared_error',
    generations=5,
    population_size=100,
    cv=10,
    random_state=1,
    n_jobs=-1,
    verbosity=2,
)

In [7]:
# Chain imputation and the TPOT search, then fit on a flattened 1-D target.
pipeline = make_pipeline(imp, regr)
pipeline.fit(X, y.ravel())

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=600.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: -89.45491886162515
Generation 2 - Current best internal CV score: -72.70950834563817
Generation 3 - Current best internal CV score: -72.51224784851074
Generation 4 - Current best internal CV score: -72.51224784851074

Best pipeline: XGBRegressor(RandomForestRegressor(RBFSampler(input_matrix, gamma=0.7000000000000001), bootstrap=True, max_features=0.6000000000000001, min_samples_leaf=6, min_samples_split=7, n_estimators=100), learning_rate=0.1, max_depth=9, min_child_weight=2, n_estimators=100, nthread=1, objective=reg:squarederror, subsample=0.6500000000000001)


Pipeline(memory=None,
         steps=[('knnimputer',
                 KNNImputer(add_indicator=False, copy=True,
                            metric='nan_euclidean', missing_values=nan,
                            n_neighbors=5, weights='distance')),
                ('tpotregressor',
                 TPOTRegressor(config_dict=None, crossover_rate=0.1, cv=10,
                               disable_update_check=False, early_stop=None,
                               generations=5,
                               log_file=<ipykernel.iostream.OutStream object at 0x7ff35d0c9b00>,
                               max_eval_time_mins=5, max_time_mins=None,
                               memory=None, mutation_rate=0.9, n_jobs=-1,
                               offspring_size=None,
                               periodic_checkpoint_folder=None,
                               population_size=100, random_state=1,
                               scoring='neg_root_mean_squared_error',
                    