## Import Dependencies

In [1]:
import numpy as np 
import pandas as pd 
import os 
from matplotlib import pyplot as plt
from matplotlib import style

## Read CSV and explore

In [2]:
# Location of the training extract, relative to this notebook's folder.
train_csv_path = os.path.join("..", "Resources", "MetObjectsML_train.csv")

# Load the extract and preview the first rows.
full_df = pd.read_csv(train_csv_path)
full_df.head()

Unnamed: 0,Is Highlight,Is Timeline Work,Is Public Domain,Department,AccessionYear,Object Begin Date,Object End Date
0,False,False,True,Greek and Roman Art,2011,1,100
1,False,False,True,Greek and Roman Art,2011,1,100
2,False,False,True,Greek and Roman Art,2011,1,100
3,False,False,True,Greek and Roman Art,2011,1,200
4,False,False,True,Asian Art,2010,1,99


In [3]:
# Print column dtypes and non-null counts for the freshly loaded frame.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402609 entries, 0 to 402608
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Is Highlight       402609 non-null  bool  
 1   Is Timeline Work   402609 non-null  bool  
 2   Is Public Domain   402609 non-null  bool  
 3   Department         402609 non-null  object
 4   AccessionYear      402609 non-null  int64 
 5   Object Begin Date  402609 non-null  int64 
 6   Object End Date    402609 non-null  int64 
dtypes: bool(3), int64(3), object(1)
memory usage: 13.4+ MB


In [4]:
# Summary statistics for the numeric columns (count, mean, std, quartiles, min/max).
full_df.describe()

Unnamed: 0,AccessionYear,Object Begin Date,Object End Date
count,402609.0,402609.0,402609.0
mean,1956.775477,1718.029023,1766.843245
std,33.127004,343.50408,296.131975
min,1870.0,1.0,1.0
25%,1932.0,1680.0,1735.0
50%,1960.0,1822.0,1870.0
75%,1980.0,1900.0,1916.0
max,2020.0,5000.0,15335.0


## Convert boolean columns to integer

In [5]:
# Cast the boolean flag to 0/1 integers explicitly instead of relying on the
# implicit bool * int coercion.
full_df["Is Public Domain"] = full_df["Is Public Domain"].astype(int)

In [6]:
# Cast the boolean flag to 0/1 integers explicitly instead of relying on the
# implicit bool * int coercion.
full_df["Is Highlight"] = full_df["Is Highlight"].astype(int)

In [7]:
# Cast the boolean flag to 0/1 integers explicitly instead of relying on the
# implicit bool * int coercion.
full_df["Is Timeline Work"] = full_df["Is Timeline Work"].astype(int)

In [8]:
# Preview the frame after the boolean flags were converted to integers.
full_df.head()

Unnamed: 0,Is Highlight,Is Timeline Work,Is Public Domain,Department,AccessionYear,Object Begin Date,Object End Date
0,0,0,1,Greek and Roman Art,2011,1,100
1,0,0,1,Greek and Roman Art,2011,1,100
2,0,0,1,Greek and Roman Art,2011,1,100
3,0,0,1,Greek and Roman Art,2011,1,200
4,0,0,1,Asian Art,2010,1,99


In [9]:
# Re-check dtypes to confirm the three flag columns are now integer typed.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402609 entries, 0 to 402608
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Is Highlight       402609 non-null  int32 
 1   Is Timeline Work   402609 non-null  int32 
 2   Is Public Domain   402609 non-null  int32 
 3   Department         402609 non-null  object
 4   AccessionYear      402609 non-null  int64 
 5   Object Begin Date  402609 non-null  int64 
 6   Object End Date    402609 non-null  int64 
dtypes: int32(3), int64(3), object(1)
memory usage: 16.9+ MB


## Convert the Department (object) column to integer codes

In [10]:
# Integer code assigned to each Met department name (names listed alphabetically, 1-19).
DEPARTMENT_CODES = {
    "Ancient Near Eastern Art": 1,
    "Arms and Armor": 2,
    "Arts of Africa, Oceania, and the Americas": 3,
    "Asian Art": 4,
    "Costume Institute": 5,
    "Drawings and Prints": 6,
    "Egyptian Art": 7,
    "European Paintings": 8,
    "European Sculpture and Decorative Arts": 9,
    "Greek and Roman Art": 10,
    "Islamic Art": 11,
    "Medieval Art": 12,
    "Modern and Contemporary Art": 13,
    "Musical Instruments": 14,
    "Photographs": 15,
    "Robert Lehman Collection": 16,
    "The American Wing": 17,
    "The Cloisters": 18,
    "The Libraries": 19,
}

# Series.map yields NaN for any value that is not a key of the dict. Map into a
# temporary first and fail loudly, so the original column is left untouched when
# something is unmapped (this also catches an accidental re-run of this cell on
# the already-encoded column, which would otherwise wipe it to NaN).
department_codes = full_df["Department"].map(DEPARTMENT_CODES)
unmapped_departments = full_df.loc[department_codes.isna(), "Department"].unique()
if len(unmapped_departments) > 0:
    raise ValueError(f"Unmapped Department values: {list(unmapped_departments)}")

full_df["Department"] = department_codes

In [11]:
# Preview the frame with Department replaced by its integer code.
full_df.head()

Unnamed: 0,Is Highlight,Is Timeline Work,Is Public Domain,Department,AccessionYear,Object Begin Date,Object End Date
0,0,0,1,10,2011,1,100
1,0,0,1,10,2011,1,100
2,0,0,1,10,2011,1,100
3,0,0,1,10,2011,1,200
4,0,0,1,4,2010,1,99


In [12]:
# Re-check dtypes and non-null counts after encoding the Department column.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402609 entries, 0 to 402608
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   Is Highlight       402609 non-null  int32
 1   Is Timeline Work   402609 non-null  int32
 2   Is Public Domain   402609 non-null  int32
 3   Department         402609 non-null  int64
 4   AccessionYear      402609 non-null  int64
 5   Object Begin Date  402609 non-null  int64
 6   Object End Date    402609 non-null  int64
dtypes: int32(3), int64(4)
memory usage: 16.9 MB


## Calculate Object Age/Met Possession (years) and confirm value counts

In [13]:
# Reference year used by the Object Age / Met Possession calculations.
CURRENT_YEAR = 2021

# DataFrame.insert raises ValueError when the column already exists, so guard
# the call to keep this cell safe to re-run.
if "Current Year" not in full_df.columns:
    full_df.insert(6, "Current Year", CURRENT_YEAR)

In [14]:
# Object age in years: reference year minus the object's end date.
# NOTE(review): describe() above shows Object End Date values beyond the
# reference year (max 15335), which produce negative ages — confirm whether
# those rows should be cleaned upstream.
full_df["Object Age"] = full_df["Current Year"].sub(full_df["Object End Date"])

In [15]:
# Years in the Met's possession: reference year minus the accession year.
full_df["Met Possession"] = full_df["Current Year"].sub(full_df["AccessionYear"])

In [16]:
# Preview the frame with the new Current Year, Object Age and Met Possession columns.
full_df.head()

Unnamed: 0,Is Highlight,Is Timeline Work,Is Public Domain,Department,AccessionYear,Object Begin Date,Current Year,Object End Date,Object Age,Met Possession
0,0,0,1,10,2011,1,2021,100,1921,10
1,0,0,1,10,2011,1,2021,100,1921,10
2,0,0,1,10,2011,1,2021,100,1921,10
3,0,0,1,10,2011,1,2021,200,1821,10
4,0,0,1,4,2010,1,2021,99,1922,11


In [17]:
# Bind a shorter name to the same DataFrame object (an alias, not a copy).
df= full_df
df.head()

Unnamed: 0,Is Highlight,Is Timeline Work,Is Public Domain,Department,AccessionYear,Object Begin Date,Current Year,Object End Date,Object Age,Met Possession
0,0,0,1,10,2011,1,2021,100,1921,10
1,0,0,1,10,2011,1,2021,100,1921,10
2,0,0,1,10,2011,1,2021,100,1921,10
3,0,0,1,10,2011,1,2021,200,1821,10
4,0,0,1,4,2010,1,2021,99,1922,11


In [20]:
# Count rows per value of the public-domain flag.
df["Is Public Domain"].value_counts()

0    211100
1    191509
Name: Is Public Domain, dtype: int64

In [21]:
# Count rows per value of the timeline-work flag.
df["Is Timeline Work"].value_counts()

0    395751
1      6858
Name: Is Timeline Work, dtype: int64

In [22]:
# Count rows per value of the highlight flag.
df["Is Highlight"].value_counts()

0    400772
1      1837
Name: Is Highlight, dtype: int64

## GroupBys

In [34]:
# Group by accession year: mean Object Age per year, flattened to a
# two-column frame (AccessionYear, Average Object Age).
object_age_df = (
    df.groupby("AccessionYear")["Object Age"]
    .mean()
    .rename("Average Object Age")
    .reset_index()
)

object_age_df.head()

Unnamed: 0,AccessionYear,Average Object Age
0,1870,1796.0
1,1871,357.866667
2,1872,233.75
3,1873,182.19697
4,1874,1727.442897
5,1875,318.2
6,1876,222.545455
7,1877,405.2
8,1878,143.0
9,1879,246.779817


In [35]:
# Check the shape and dtypes of the per-accession-year aggregate.
object_age_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   AccessionYear       151 non-null    int64  
 1   Average Object Age  151 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 2.5 KB


In [37]:
# Group by department: mean Met Possession (years) per department code,
# flattened to a two-column frame (Department, Average Years of Met Possession).
met_pos_df = (
    df.groupby("Department")["Met Possession"]
    .mean()
    .rename("Average Years of Met Possession")
    .reset_index()
)

# The original cell ended with a bare `.head()` (a SyntaxError); preview the
# aggregate that was just built.
met_pos_df.head()

Unnamed: 0,Department,Average Years of Met Possession
0,1,80.802817
1,2,87.621812
2,3,41.01589
3,4,77.561104
4,5,37.613161


In [39]:
# Group by department: mean Object Age per department code, flattened to a
# two-column frame (Department, Average Object Age).
dept_age_df = (
    df.groupby("Department")["Object Age"]
    .mean()
    .rename("Average Object Age")
    .reset_index()
)

dept_age_df.head()

Unnamed: 0,Department,Average Object Age
0,1,1561.140063
1,2,281.704181
2,3,357.891658
3,4,320.653561
4,5,94.248611


## Save DataFrames as CSVs for Tableau

In [27]:
#df.to_csv(r'C:\Users\misrael\Documents\GitHub\Project-3-Group-2\Resources\Object_Age_Met_Possession.csv', index = False, header=True)

In [40]:
# Export for Tableau. Written to the same relative ../Resources folder the
# notebook reads its input from, instead of a user-specific absolute Windows
# path that only exists on one machine.
# NOTE(review): assumes ../Resources is the repository's Resources folder — confirm.
object_age_df.to_csv(
    os.path.join("..", "Resources", "Object_Age_Accession_Year.csv"),
    index=False,
    header=True,
)

In [41]:
# Export for Tableau. Written to the same relative ../Resources folder the
# notebook reads its input from, instead of a user-specific absolute Windows
# path that only exists on one machine.
# NOTE(review): assumes ../Resources is the repository's Resources folder — confirm.
met_pos_df.to_csv(
    os.path.join("..", "Resources", "Dept_Met_Possession.csv"),
    index=False,
    header=True,
)

In [42]:
# Export for Tableau. Written to the same relative ../Resources folder the
# notebook reads its input from, instead of a user-specific absolute Windows
# path that only exists on one machine.
# NOTE(review): assumes ../Resources is the repository's Resources folder — confirm.
dept_age_df.to_csv(
    os.path.join("..", "Resources", "Dept_Object_Age.csv"),
    index=False,
    header=True,
)