## Imports and Read-in

In [1]:
import os
import pandas as pd
from pandas_geojson import to_geojson, write_geojson

In [2]:
## Snapshot of the housing litigation data, as of 12.28.22
df = pd.read_csv(filepath_or_buffer='Housing_Litigations.csv')

## Data Index:
df - Original read-in of the data. No changes <br>
df2 - Rows of `df` whose `CaseType` is `Tenant Action/Harrassment` (spelled as it appears in the data) <br>
df_SomeYear - All the cases in `df2` filed in that year (e.g. `df_2018`)

## Data Cleaner

In [3]:
## List the column names available in the raw data
df.columns

Index(['LitigationID', 'BuildingID', 'Boro', 'HouseNumber', 'StreetName',
       'Zip', 'Block', 'Lot', 'CaseType', 'CaseOpenDate', 'CaseStatus',
       'OpenJudgement', 'FindingOfHarassment', 'FindingDate', 'Penalty',
       'Respondent', 'Latitude', 'Longitude', 'Community District',
       'Council District', 'Census Tract', 'BIN', 'BBL', 'NTA'],
      dtype='object')

In [4]:
## How many cases fall under each CaseType
df.CaseType.value_counts()

Tenant Action                   84340
Heat and Hot Water              48602
Access Warrant - Non-Lead       17768
Tenant Action/Harrassment       14439
False Certification Non-Lead     7524
Comprehensive                    7244
Access Warrant - lead            3916
CONH                             2364
Comp Supplemental Cases          2037
Lead False Certification          675
7A                                358
Heat Supplemental Cases           317
Failure to Register Only          245
HLD - Other Case Type               8
Name: CaseType, dtype: int64

In [50]:
## Create new df with only Tenant Action/Harrassment (spelling matches the data).
## .copy() makes df2 an independent frame, so columns assigned to it later
## do not raise SettingWithCopyWarning and can never write through to df.
df2 = df.loc[df['CaseType'] == 'Tenant Action/Harrassment'].copy()
## Count the rows with (True) and without (False) a missing Latitude
df2['Latitude'].isna().value_counts()

False    14416
True        23
Name: Latitude, dtype: int64

In [51]:
## Save the harassment-only cases to CSV (the DataFrame index is written as well)
df2.to_csv(path_or_buf='TenantHarassmentCasesOnly.csv')

In [6]:
## Remove rows without location information.
## Both coordinates feed the GeoJSON points, so a row missing either one is dropped.
## `subset` is given as a list because a bare string is only accepted by newer
## pandas releases, and .copy() detaches the result so that assigning new
## columns to df2 afterwards does not raise SettingWithCopyWarning.
df2 = df2.dropna(subset=['Latitude', 'Longitude']).copy()

In [7]:
## Confirm that no row is left without a Latitude
df2.Latitude.isna().value_counts()

False    14416
Name: Latitude, dtype: int64

In [8]:
## Look at the raw CaseOpenDate values and the dtype they were read in as
df.CaseOpenDate

0         05/18/2016
1         03/03/2020
2         01/25/2011
3         10/25/2013
4         06/28/2010
             ...    
189832    11/30/2021
189833    11/25/2022
189834    11/22/2022
189835    11/10/2022
189836    10/27/2022
Name: CaseOpenDate, Length: 189837, dtype: object

In [9]:
## Create new column with the year the case was filed.
## CaseOpenDate is a MM/DD/YYYY string, so everything from position 6 on is the year.
## assign() returns a new frame instead of writing into df2 in place, which
## avoids SettingWithCopyWarning while df2 is still a filtered slice.
df2 = df2.assign(CaseOpenYear=df2['CaseOpenDate'].str[6:])

In [10]:
## Number of cases opened in each year
df2['CaseOpenYear'].value_counts()

2022    1879
2019    1627
2021    1608
2018    1205
2020    1131
2016    1106
2017    1081
2015     897
2014     803
2013     774
2012     654
2010     493
2011     444
2009     440
2008     272
2004       1
2030       1
Name: CaseOpenYear, dtype: int64

In [11]:
## Replace every missing value, in every column, with the text 'N/A'
df2 = df2.fillna(value='N/A')

In [12]:
## Force the filing year to text. NOTE: writing integers to a GeoJSON file
## did not work for some reason, so the value has to be a string.
df2['CaseOpenYear'] = df2.CaseOpenYear.astype(str)

In [13]:
## One DF per filing year, 2018 through 2022
df_2018, df_2019, df_2020, df_2021, df_2022 = (
    df2.loc[df2['CaseOpenYear'] == filing_year]
    for filing_year in ('2018', '2019', '2020', '2021', '2022')
)

### Make a GeoJSON file for all cases, plus one for each year, 2018 through 2022

In [14]:
## Columns copied into the `properties` of every GeoJSON feature.
## Defined once so that all the exported files are guaranteed to share them.
geojson_properties = [
    'LitigationID', 'BuildingID', 'Boro', 'HouseNumber', 'StreetName',
    'CaseOpenDate', 'CaseStatus', 'OpenJudgement', 'FindingOfHarassment',
    'FindingDate', 'Penalty', 'Respondent', 'CaseOpenYear',
]

## Output file -> frame to export: every case first, then one file per filing year
geojson_exports = {
    'NYC_tenant_harassment.geojson': df2,
    'NYC_tenant_harassment2018.geojson': df_2018,
    'NYC_tenant_harassment2019.geojson': df_2019,
    'NYC_tenant_harassment2020.geojson': df_2020,
    'NYC_tenant_harassment2021.geojson': df_2021,
    'NYC_tenant_harassment2022.geojson': df_2022,
}

## Identical export settings for every file. geo_json ends up holding the 2022
## export, exactly as it did when each file was written by its own cell.
for geojson_filename, cases in geojson_exports.items():
    geo_json = to_geojson(df=cases, lat='Latitude', lon='Longitude',
                          properties=list(geojson_properties))
    write_geojson(geo_json, filename=geojson_filename, indent=4)

In [20]:
## URL prefix that ends in the same 'NYC_tenant_harassment' stem as the GeoJSON
## filenames written above. NOTE(review): presumably where those files are
## hosted -- confirm; in this view the value is only printed.
base_name = 'https://trd-digital.github.io/trd-news-interactive-maps/NYC_tenant_harassment'

In [21]:
## Show the URL prefix
print(base_name)

https://trd-digital.github.io/trd-news-interactive-maps/NYC_tenant_harassment


## Data Analysis

In [22]:
## Row counts: all cases first, then each year 2018 through 2022,
## with a dashed line between consecutive counts
print(
    *(len(cases) for cases in (df2, df_2018, df_2019, df_2020, df_2021, df_2022)),
    sep='\n------\n',
)

14416
------
1205
------
1627
------
1131
------
1608
------
1879


In [23]:
## Outcome breakdown across all harassment cases
df2['FindingOfHarassment'].value_counts()

N/A              10499
No Harassment     3723
After Inquest      151
After Trial         43
Name: FindingOfHarassment, dtype: int64

In [24]:
## Cases per filing year
df2.CaseOpenYear.value_counts()

2022    1879
2019    1627
2021    1608
2018    1205
2020    1131
2016    1106
2017    1081
2015     897
2014     803
2013     774
2012     654
2010     493
2011     444
2009     440
2008     272
2004       1
2030       1
Name: CaseOpenYear, dtype: int64

In [25]:
## Turn Penalty into a numeric column, counting a missing penalty as 0.
## After the blanket fillna, Penalty holds real numbers mixed with the text 'N/A'.
## .str.replace() returns NaN for every entry that is not a string, so it would
## wipe out the actual penalty amounts and keep only the placeholders.
## to_numeric(errors='coerce') keeps the numbers and turns 'N/A' into NaN,
## which is then filled with 0.
df2['Penalty'] = pd.to_numeric(df2['Penalty'], errors='coerce').fillna(0)

In [26]:
## Re-check the column names of the raw data
df.columns

Index(['LitigationID', 'BuildingID', 'Boro', 'HouseNumber', 'StreetName',
       'Zip', 'Block', 'Lot', 'CaseType', 'CaseOpenDate', 'CaseStatus',
       'OpenJudgement', 'FindingOfHarassment', 'FindingDate', 'Penalty',
       'Respondent', 'Latitude', 'Longitude', 'Community District',
       'Council District', 'Census Tract', 'BIN', 'BBL', 'NTA'],
      dtype='object')

In [27]:
## Total Penalty per CaseType in the raw data
df['Penalty'].groupby(df['CaseType']).sum()

CaseType
7A                                   0.0
Access Warrant - Non-Lead            0.0
Access Warrant - lead                0.0
CONH                                 0.0
Comp Supplemental Cases              0.0
Comprehensive                        0.0
Failure to Register Only             0.0
False Certification Non-Lead         0.0
HLD - Other Case Type                0.0
Heat Supplemental Cases              0.0
Heat and Hot Water                   0.0
Lead False Certification             0.0
Tenant Action                        0.0
Tenant Action/Harrassment       942952.0
Name: Penalty, dtype: float64

In [28]:
# df2['Penalty'] = pd.to_numeric(df['Penalty'])

In [29]:
# ## For all years in the data
# print(f'${df2.Penalty.sum():,}')

In [30]:
# df2.groupby('CaseOpenYear')['Penalty'].sum()

In [31]:
## Stack the five yearly frames (2018 through 2022) into a single DF
df3 = pd.concat(objs=(df_2018, df_2019, df_2020, df_2021, df_2022))

In [32]:
## Cases per filing year in the 2018-2022 subset
df3['CaseOpenYear'].value_counts()

2022    1879
2019    1627
2021    1608
2018    1205
2020    1131
Name: CaseOpenYear, dtype: int64

In [33]:
# df3['Penalty'] = df3['Penalty'].str.replace('N/A','0')
# df3.Penalty = pd.to_numeric(df3['Penalty'])

In [34]:
## Respondent value counts
# df3.Respondent.value_counts().head(60)

## Set up for scatter graphic (not used)

In [35]:
# df3['CaseOpenDate'] = pd.to_datetime(df3['CaseOpenDate'])
# m = df3.groupby(pd.Grouper(key='CaseOpenDate',freq='M'))['LitigationID'].count()
# m.to_csv('cases_by_year')
# m = pd.read_csv('cases_by_year')
# m['CaseOpenDate'] = m['CaseOpenDate'].str[:-3]

In [36]:
# import matplotlib.pyplot as plt

In [37]:
# year_month = m['CaseOpenDate']
# case_load = m['LitigationID']

## Chunk up data by outcome

In [38]:
## Column names of df3 (the 2018-2022 subset)
df3.columns

Index(['LitigationID', 'BuildingID', 'Boro', 'HouseNumber', 'StreetName',
       'Zip', 'Block', 'Lot', 'CaseType', 'CaseOpenDate', 'CaseStatus',
       'OpenJudgement', 'FindingOfHarassment', 'FindingDate', 'Penalty',
       'Respondent', 'Latitude', 'Longitude', 'Community District',
       'Council District', 'Census Tract', 'BIN', 'BBL', 'NTA',
       'CaseOpenYear'],
      dtype='object')

In [39]:
## Closed vs. pending cases in the 2018-2022 subset
df3.CaseStatus.value_counts()

CLOSED     6450
PENDING    1000
Name: CaseStatus, dtype: int64

In [40]:
## Cases with and without an open judgement in the 2018-2022 subset
df3.OpenJudgement.value_counts()

NO     7421
YES      29
Name: OpenJudgement, dtype: int64

In [41]:
## Outcome breakdown for the 2018-2022 subset
df3.FindingOfHarassment.value_counts()

N/A              4584
No Harassment    2745
After Inquest      88
After Trial        33
Name: FindingOfHarassment, dtype: int64

In [42]:
## Compare row counts: the 2018-2022 subset (df3) vs. all harassment cases (df2)
print(len(df3))
## Print the separator itself; len('----') printed the number 4 instead
print('----')
print(len(df2))

7450
4
14416


In [43]:
## Rebuild Penalty as text: the amount where one was recorded, 'None Reported' otherwise.
## The untouched amounts are read from df, but explicitly for the rows that are in
## df3 rather than through implicit index alignment against the whole of df.
## Missing values are detected with pd.isna() instead of replacing the substring
## 'nan' inside the stringified numbers.
df3['Penalty'] = df.loc[df3.index, 'Penalty'].map(
    lambda amount: 'None Reported' if pd.isna(amount) else str(amount)
)

In [44]:
## How often each penalty amount occurs in the 2018-2022 subset
df3['Penalty'].value_counts()

None Reported    7329
2000.0             50
0.0                33
5000.0              8
10000.0             4
4000.0              3
1000.0              3
3000.0              3
2500.0              2
6000.0              2
500.0               1
5590.0              1
2360.0              1
560.0               1
41160.0             1
29550.0             1
90830.0             1
7500.0              1
56100.0             1
14000.0             1
80500.0             1
9000.0              1
8000.0              1
Name: Penalty, dtype: int64

In [45]:
## Split df3 by FindingOfHarassment value:
## NA = 'N/A', NH = 'No Harassment', AI = 'After Inquest', AT = 'After Trial'
df_NA, df_NH, df_AI, df_AT = (
    df3.loc[df3['FindingOfHarassment'] == finding]
    for finding in ('N/A', 'No Harassment', 'After Inquest', 'After Trial')
)

In [46]:
## Columns copied into the `properties` of every GeoJSON feature.
## Defined once so that all four outcome files are guaranteed to share them.
finding_properties = [
    'LitigationID', 'BuildingID', 'Boro', 'HouseNumber', 'StreetName',
    'CaseOpenDate', 'CaseStatus', 'OpenJudgement', 'FindingOfHarassment',
    'FindingDate', 'Penalty', 'Respondent', 'CaseOpenYear',
]

## Output file -> frame to export, one file per FindingOfHarassment value
finding_exports = {
    'NYC_tenant_harassment_NA.geojson': df_NA,
    'NYC_tenant_harassment_NH.geojson': df_NH,
    'NYC_tenant_harassment_AI.geojson': df_AI,
    'NYC_tenant_harassment_AT.geojson': df_AT,
}

## Identical export settings for every file. geo_json ends up holding the
## After Trial export, exactly as it did when each file had its own cell.
for finding_filename, cases in finding_exports.items():
    geo_json = to_geojson(df=cases, lat='Latitude', lon='Longitude',
                          properties=list(finding_properties))
    write_geojson(geo_json, filename=finding_filename, indent=4)