## Preprocessing

In [3]:
# Import our dependencies
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
import pandas as pd
# import tensorflow as tf
import matplotlib.pyplot as plt

# IRS Data

In [4]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
irs_data_df = pd.read_csv("Resources/IRS_NY_2020.csv")
irs_data_df

Unnamed: 0,ZipCode,Number of returns,Total income Amount (In thousand of dollars),Average total income
0,10001,15590,2830868,181582.29630
1,10002,41020,2697332,65756.50902
2,10003,25780,6953211,269713.38250
3,10004,2220,1047897,472025.67570
4,10005,5530,2374656,429413.38160
...,...,...,...,...
1533,14901,5460,222977,40838.27839
1534,14903,3630,246522,67912.39669
1535,14904,6900,287843,41716.37681
1536,14905,4330,318597,73578.98383


In [5]:
# View the data types
print(irs_data_df.dtypes)

ZipCode                                           int64
Number of returns                                 int64
Total income Amount (In thousand of dollars)      int64
Average total income                            float64
dtype: object


In [6]:
# Convert float64 data to integers
irs_data_df['Average total income'] = irs_data_df['Average total income'].astype('int')
print(irs_data_df.dtypes)

ZipCode                                         int64
Number of returns                               int64
Total income Amount (In thousand of dollars)    int64
Average total income                            int32
dtype: object


# NYC Licensing Data

In [7]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
nyc_licensing_df = pd.read_csv("Resources/NYC_Dog_Licensing_Dataset_20240218.csv")
nyc_licensing_df

  nyc_licensing_df = pd.read_csv("Resources/NYC_Dog_Licensing_Dataset_20240218.csv")


Unnamed: 0,AnimalName,AnimalGender,AnimalBirthYear,BreedName,ZipCode,LicenseIssuedDate,LicenseExpiredDate,Extract Year
0,PAIGE,F,2014,American Pit Bull Mix / Pit Bull Mix,10035.0,09/12/2014,09/12/2017,2016
1,YOGI,M,2010,Boxer,10465.0,09/12/2014,10/02/2017,2016
2,ALI,M,2014,Basenji,10013.0,09/12/2014,09/12/2019,2016
3,QUEEN,F,2013,Akita Crossbreed,10013.0,09/12/2014,09/12/2017,2016
4,LOLA,F,2009,Maltese,10028.0,09/12/2014,10/09/2017,2016
...,...,...,...,...,...,...,...,...
616885,SKYE,F,2016,Great Pyrenees,11218.0,11/01/2023,12/02/2024,2023
616886,UNKNOWN,F,2023,Shih Tzu Crossbreed,10022.0,11/01/2023,11/01/2024,2023
616887,MUNYU,M,2009,"Poodle, Toy",11355.0,11/01/2023,11/24/2024,2023
616888,SAINT,M,2021,Unknown,11412.0,11/01/2023,11/01/2024,2023


In [8]:
# View the data types
print(nyc_licensing_df.dtypes)

AnimalName             object
AnimalGender           object
AnimalBirthYear        object
BreedName              object
ZipCode               float64
LicenseIssuedDate      object
LicenseExpiredDate     object
Extract Year            int64
dtype: object


In [9]:
# Replace NaN with 0 
nyc_licensing_df['ZipCode'].fillna(0, inplace=True)

# Convert to integer
nyc_licensing_df['ZipCode'] = nyc_licensing_df['ZipCode'].astype('int')
print(nyc_licensing_df.dtypes)

AnimalName            object
AnimalGender          object
AnimalBirthYear       object
BreedName             object
ZipCode                int32
LicenseIssuedDate     object
LicenseExpiredDate    object
Extract Year           int64
dtype: object


In [10]:
# Determine the number of unique values in each column.
nyc_licensing_df.nunique()

AnimalName            32083
AnimalGender              2
AnimalBirthYear          76
BreedName              1591
ZipCode                 911
LicenseIssuedDate      3337
LicenseExpiredDate     4869
Extract Year              5
dtype: int64

# Merge IRS NY data with NYC Licensing Zip Code data

In [11]:
# Combine two csv files by using the ZipCode Column.
merged_df = pd.merge(irs_data_df, nyc_licensing_df, on='ZipCode', how='left')
merged_df

Unnamed: 0,ZipCode,Number of returns,Total income Amount (In thousand of dollars),Average total income,AnimalName,AnimalGender,AnimalBirthYear,BreedName,LicenseIssuedDate,LicenseExpiredDate,Extract Year
0,10001,15590,2830868,181582,BANDIT,M,2014,Shih Tzu,10/07/2014,10/07/2016,2016.0
1,10001,15590,2830868,181582,SCOOBY,M,2012,"Schnauzer, Miniature",11/19/2014,09/06/2016,2016.0
2,10001,15590,2830868,181582,GRACE,F,2011,Shiba Inu,11/25/2014,11/25/2016,2016.0
3,10001,15590,2830868,181582,TUCKER,M,2005,"Bull Dog, English",12/03/2014,01/30/2016,2016.0
4,10001,15590,2830868,181582,DAISY,F,2012,"Bull Dog, English",01/09/2015,01/09/2017,2016.0
...,...,...,...,...,...,...,...,...,...,...,...
612861,14901,5460,222977,40838,CHESTER,M,2021,Maltipoo,01/16/2023,01/27/2024,2023.0
612862,14903,3630,246522,67912,ZEKE,M,2022,German Shepherd Dog,07/28/2022,07/28/2023,2023.0
612863,14904,6900,287843,41716,,,,,,,
612864,14905,4330,318597,73578,,,,,,,


In [12]:
# Drop the non-beneficial columns
merged_df.drop(['Number of returns', 'Total income Amount (In thousand of dollars)', 'AnimalName'], axis=1, inplace=True)
merged_df

Unnamed: 0,ZipCode,Average total income,AnimalGender,AnimalBirthYear,BreedName,LicenseIssuedDate,LicenseExpiredDate,Extract Year
0,10001,181582,M,2014,Shih Tzu,10/07/2014,10/07/2016,2016.0
1,10001,181582,M,2012,"Schnauzer, Miniature",11/19/2014,09/06/2016,2016.0
2,10001,181582,F,2011,Shiba Inu,11/25/2014,11/25/2016,2016.0
3,10001,181582,M,2005,"Bull Dog, English",12/03/2014,01/30/2016,2016.0
4,10001,181582,F,2012,"Bull Dog, English",01/09/2015,01/09/2017,2016.0
...,...,...,...,...,...,...,...,...
612861,14901,40838,M,2021,Maltipoo,01/16/2023,01/27/2024,2023.0
612862,14903,67912,M,2022,German Shepherd Dog,07/28/2022,07/28/2023,2023.0
612863,14904,41716,,,,,,
612864,14905,73578,,,,,,


In [13]:
# View the data types
print(merged_df.dtypes)

ZipCode                   int64
Average total income      int32
AnimalGender             object
AnimalBirthYear          object
BreedName                object
LicenseIssuedDate        object
LicenseExpiredDate       object
Extract Year            float64
dtype: object


In [14]:
# Convert to integer
merged_df['Average total income'] = merged_df['Average total income'].astype('int')
print(merged_df.dtypes)

ZipCode                   int64
Average total income      int32
AnimalGender             object
AnimalBirthYear          object
BreedName                object
LicenseIssuedDate        object
LicenseExpiredDate       object
Extract Year            float64
dtype: object


In [15]:
# Replace NaN with 0 
merged_df['Extract Year'].fillna(0, inplace=True)

# Convert to integer
merged_df['Extract Year'] = merged_df['Extract Year'].astype('int')
print(merged_df.dtypes)

ZipCode                  int64
Average total income     int32
AnimalGender            object
AnimalBirthYear         object
BreedName               object
LicenseIssuedDate       object
LicenseExpiredDate      object
Extract Year             int32
dtype: object


In [16]:
# Determine the number of unique values in each column.
merged_df.nunique()

ZipCode                 1538
Average total income    1515
AnimalGender               2
AnimalBirthYear           76
BreedName               1584
LicenseIssuedDate       3337
LicenseExpiredDate      4868
Extract Year               6
dtype: int64

In [17]:
# Export DataFrame to a CSV file
merged_df.to_csv('merged_df.csv', index=False)

# Analyze Data - Review Correlations

In [18]:
# Calculate the correlation between the Average-total-income and ZipCode
correlation = merged_df['Average total income'].corr(merged_df['ZipCode'])
print(correlation)

-0.5135861842173098


# Group ZipCodes to 5 NYC Boroughs

In [19]:
# Brooklyn:11201-11256
# Manhattan:10001-10282
# Queens:11004-11109, 11351-11697
# Staten Island :10301-10314
# Bronx: 10451-10475

In [20]:
# Make five Boroughs
def categorize_zipcode(zip_code):
    # Brooklyn
    if 11201 <= zip_code <= 11256:
        return 'Brooklyn'
    # Manhattan
    elif 10001 <= zip_code <= 10282:
        return 'Manhattan'
    # Queens
    elif (11004 <= zip_code <= 11109) or (11351 <= zip_code <= 11697):
        return 'Queens'
    # Staten Island
    elif 10301 <= zip_code <= 10314:
        return 'Staten Island'
    # Bronx
    elif 10451 <= zip_code <= 10475:
        return 'Bronx'
    else:
        return 'Others'

# Applying the function to the DataFrame
merged_df['Borough'] = merged_df['ZipCode'].apply(categorize_zipcode)

# Display the DataFrame to verify the categorization
merged_df

Unnamed: 0,ZipCode,Average total income,AnimalGender,AnimalBirthYear,BreedName,LicenseIssuedDate,LicenseExpiredDate,Extract Year,Borough
0,10001,181582,M,2014,Shih Tzu,10/07/2014,10/07/2016,2016,Manhattan
1,10001,181582,M,2012,"Schnauzer, Miniature",11/19/2014,09/06/2016,2016,Manhattan
2,10001,181582,F,2011,Shiba Inu,11/25/2014,11/25/2016,2016,Manhattan
3,10001,181582,M,2005,"Bull Dog, English",12/03/2014,01/30/2016,2016,Manhattan
4,10001,181582,F,2012,"Bull Dog, English",01/09/2015,01/09/2017,2016,Manhattan
...,...,...,...,...,...,...,...,...,...
612861,14901,40838,M,2021,Maltipoo,01/16/2023,01/27/2024,2023,Others
612862,14903,67912,M,2022,German Shepherd Dog,07/28/2022,07/28/2023,2023,Others
612863,14904,41716,,,,,,0,Others
612864,14905,73578,,,,,,0,Others


In [21]:
# Drop rows where 'Borough' column has the value 'Others'
boroughs_df = merged_df[merged_df['Borough'] != 'Others']
boroughs_df

Unnamed: 0,ZipCode,Average total income,AnimalGender,AnimalBirthYear,BreedName,LicenseIssuedDate,LicenseExpiredDate,Extract Year,Borough
0,10001,181582,M,2014,Shih Tzu,10/07/2014,10/07/2016,2016,Manhattan
1,10001,181582,M,2012,"Schnauzer, Miniature",11/19/2014,09/06/2016,2016,Manhattan
2,10001,181582,F,2011,Shiba Inu,11/25/2014,11/25/2016,2016,Manhattan
3,10001,181582,M,2005,"Bull Dog, English",12/03/2014,01/30/2016,2016,Manhattan
4,10001,181582,F,2012,"Bull Dog, English",01/09/2015,01/09/2017,2016,Manhattan
...,...,...,...,...,...,...,...,...,...
611288,11697,110414,F,2021,German Shepherd Crossbreed,08/24/2023,10/09/2024,2023,Queens
611289,11697,110414,M,2016,Unknown,09/12/2023,08/26/2024,2023,Queens
611290,11697,110414,F,2012,German Shepherd Dog,09/13/2023,11/03/2024,2023,Queens
611291,11697,110414,F,2016,Labrador Retriever Crossbreed,09/15/2023,05/25/2024,2023,Queens


# [TBD] Machine Learning Data Preparation

In [28]:
# Look at ZipCode value counts for binning
nyc_zipcode_count = merged_df['ZipCode'].value_counts()
nyc_zipcode_count

ZipCode
10025    13819
10023    11189
11201    10907
11215    10849
10024    10581
         ...  
12766        1
12765        1
12764        1
12763        1
99999        1
Name: count, Length: 1538, dtype: int64

In [33]:
# Choose a cutoff value and create a list of ZipCode types to be replaced
# use the variable name `ZipCode_types_to_replace`
ZipCode_types_to_replace = list(nyc_zipcode_count[nyc_zipcode_count <= 10].index)

# Replace in dataframe
for a in ZipCode_types_to_replace:
    merged_df['ZipCode'] = merged_df['ZipCode'].replace(a,"Other")

# Check to make sure binning was successful
merged_df['ZipCode'].value_counts()

ZipCode
10025    13819
10023    11189
11201    10907
11215    10849
10024    10581
         ...  
11757       11
11771       11
11977       11
11020       11
10512       11
Name: count, Length: 229, dtype: int64

In [34]:
# Look at Averag-total-income for binning
avg_income_count = merged_df['Average total income'].value_counts()
avg_income_count

Average total income
173862    13819
347467    11189
250671    10907
167787    10849
401029    10581
          ...  
71190         1
66980         1
112220        1
74197         1
200068        1
Name: count, Length: 1515, dtype: int64

In [35]:
# Look at Average-total-income counts > 10
avg_income_count_over_10 = avg_income_count[avg_income_count > 10]
avg_income_count_over_10

Average total income
173862    13819
347467    11189
250671    10907
167787    10849
401029    10581
          ...  
159719       11
136942       11
99210        11
70481        11
195785       11
Name: count, Length: 228, dtype: int64

In [36]:
# Choose a cutoff value and create a list of Average-total-income to be replaced
# use the variable name `avg_income_to_replace`
avg_income_to_replace = list(avg_income_count[avg_income_count <= 20].index)

# Replace in dataframe
for b in avg_income_to_replace:
    merged_df['Average total income'] = merged_df['Average total income'].replace(b,"Other")

# Check to make sure binning was successful
merged_df['Average total income'].value_counts()

Average total income
173862    13819
347467    11189
250671    10907
167787    10849
401029    10581
          ...  
44284        21
88972        21
75670        21
422223       21
321500       21
Name: count, Length: 198, dtype: int64

In [37]:
# Convert categorical data to numeric with `pd.get_dummies`
numeric_data = pd.get_dummies(merged_df)
numeric_data.head()

Unnamed: 0,Extract Year,ZipCode_10001,ZipCode_10002,ZipCode_10003,ZipCode_10004,ZipCode_10005,ZipCode_10006,ZipCode_10007,ZipCode_10009,ZipCode_10010,...,LicenseExpiredDate_12/31/2019,LicenseExpiredDate_12/31/2020,LicenseExpiredDate_12/31/2021,LicenseExpiredDate_12/31/2022,LicenseExpiredDate_12/31/2023,LicenseExpiredDate_12/31/2024,LicenseExpiredDate_12/31/2025,LicenseExpiredDate_12/31/2026,LicenseExpiredDate_12/31/2027,LicenseExpiredDate_12/31/2028
0,2016,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2016,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2016,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2016,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2016,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [38]:
# Split our preprocessed data into our features and target arrays
y = numeric_data['Extract Year']
X = numeric_data.drop(['Extract Year'],axis=1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [39]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

MemoryError: Unable to allocate 16.7 GiB for an array with shape (4868, 459649) and data type float64

## Compile, Train and Evaluate the Model

In [None]:
# Attempt #1
# layer1 = 10 : activation function = relu
# layer2 = 20 : activation function = relu

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 10
hidden_nodes_layer2 = 20


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100)

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")