# Data imputation techniques

In [13]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import scipy.stats as stats
from sklearn.metrics import mean_squared_error

In [14]:
df = pd.read_csv("uber.csv")

In [15]:
df.isnull().sum()

key                  0
date                 0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

## 1. Deletion of rows with missing  data

In [16]:
df1 = df.dropna()
df1.isnull().sum()

key                  0
date                 0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

## 2. Mean/Median/Mode Imputation

In [17]:
df = pd.read_csv("uber.csv")
longmean = df["dropoff_longitude"].mean()
latmean = df["dropoff_latitude"].mean()
df['dropoff_longitude'].values[87946] = round(longmean,3)
df['dropoff_latitude'].values[87946] = round(latmean,3)

print(df['dropoff_latitude'][87946])
print(df['dropoff_longitude'][87946])

39.924
-72.525


In [18]:
df = pd.read_csv("uber.csv")
longmed = df["dropoff_longitude"].median()
latmed = df["dropoff_latitude"].median()
df['dropoff_longitude'].values[87946] = round(longmed,3)
df['dropoff_latitude'].values[87946] = round(latmed,3)

print(df['dropoff_latitude'][87946])
print(df['dropoff_longitude'][87946])

40.753
-73.98


In [19]:
df4 = df[['dropoff_latitude','dropoff_longitude']]
for i in df4:
    df4[i] = df4[i].fillna(df4[i].mode()[0])
    # print(df4[i].fillna(df4[i].mode()[0]))
df4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4[i] = df4[i].fillna(df4[i].mode()[0])


Unnamed: 0,dropoff_latitude,dropoff_longitude
0,40.723217,-73.999512
1,40.750325,-73.994710
2,40.772647,-73.962565
3,40.803349,-73.965316
4,40.761247,-73.973082
...,...,...
199994,40.740297,-73.986525
199995,40.739620,-74.006672
199996,40.692588,-73.858957
199997,40.695415,-73.983215


## 3. Arbitrary Value Imputation

In [20]:
minLong = df['dropoff_longitude'].min()
maxLong = df['dropoff_longitude'].max()
randLong = np.random.uniform(minLong,maxLong)
dl = df.dropoff_longitude.values[87964] = (randLong)
print(dl)

-1675.90650758639


## 4. End of Tail Imputation

In [21]:
df6 = df[['dropoff_latitude','dropoff_longitude']]
for i in df6:
    eod_value = df6[i].mean() + 3*df6[i].std()
    df6[i] = df6[i].fillna(eod_value)
df6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6[i] = df6[i].fillna(eod_value)


Unnamed: 0,dropoff_latitude,dropoff_longitude
0,40.723217,-73.999512
1,40.750325,-73.994710
2,40.772647,-73.962565
3,40.803349,-73.965316
4,40.761247,-73.973082
...,...,...
199994,40.740297,-73.986525
199995,40.739620,-74.006672
199996,40.692588,-73.858957
199997,40.695415,-73.983215


## 5. Random Sample Imputation

In [22]:
def random_imputation(df, feature):
    number_missing = df[feature].isnull().sum()
    observed_values = df.loc[df[feature].notnull(), feature]
    df.loc[df[feature].isnull(), feature + '_imp'] = np.random.choice(observed_values, number_missing, replace = True)    
    return df

In [23]:
df6 = df[['dropoff_latitude','dropoff_longitude']]
for feature in df6:
    df6[feature + '_imp'] = df6[feature]
    df6 = random_imputation(df6, feature)
df6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6[feature + '_imp'] = df6[feature]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df6[feature + '_imp'] = df6[feature]


Unnamed: 0,dropoff_latitude,dropoff_longitude,dropoff_latitude_imp,dropoff_longitude_imp
0,40.723217,-73.999512,40.723217,-73.999512
1,40.750325,-73.994710,40.750325,-73.994710
2,40.772647,-73.962565,40.772647,-73.962565
3,40.803349,-73.965316,40.803349,-73.965316
4,40.761247,-73.973082,40.761247,-73.973082
...,...,...,...,...
199994,40.740297,-73.986525,40.740297,-73.986525
199995,40.739620,-74.006672,40.739620,-74.006672
199996,40.692588,-73.858957,40.692588,-73.858957
199997,40.695415,-73.983215,40.695415,-73.983215


In [24]:
# df1 = df[['dropoff_latitude', 'dropoff_longitude']]
# for i in df1:
#     df[i].fillna(df.sample(df[i]))
# # print(df1)
# df1.isnull().sum()
sample = (df.sample(df.dropoff_latitude.isnull().sum()))['dropoff_latitude']
# print(sample)
df1 = df.dropoff_latitude.fillna(sample)
df1.info()

<class 'pandas.core.series.Series'>
RangeIndex: 199999 entries, 0 to 199998
Series name: dropoff_latitude
Non-Null Count   Dtype  
--------------   -----  
199999 non-null  float64
dtypes: float64(1)
memory usage: 1.5 MB
