## Import Dependencies

In [1]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd 
import os 

# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

## Read CSV and explore

In [2]:
full_df = pd.read_csv(os.path.join("..", "Resources", "MetObjectsML_train.csv"))
full_df.head()

Unnamed: 0,Is Highlight,Is Timeline Work,Is Public Domain,Department,AccessionYear,Object Begin Date,Object End Date
0,False,False,True,Greek and Roman Art,2011,1,100
1,False,False,True,Greek and Roman Art,2011,1,100
2,False,False,True,Greek and Roman Art,2011,1,100
3,False,False,True,Greek and Roman Art,2011,1,200
4,False,False,True,Asian Art,2010,1,99


In [3]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402609 entries, 0 to 402608
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Is Highlight       402609 non-null  bool  
 1   Is Timeline Work   402609 non-null  bool  
 2   Is Public Domain   402609 non-null  bool  
 3   Department         402609 non-null  object
 4   AccessionYear      402609 non-null  int64 
 5   Object Begin Date  402609 non-null  int64 
 6   Object End Date    402609 non-null  int64 
dtypes: bool(3), int64(3), object(1)
memory usage: 13.4+ MB


In [4]:
full_df.describe()

Unnamed: 0,AccessionYear,Object Begin Date,Object End Date
count,402609.0,402609.0,402609.0
mean,1956.775477,1718.029023,1766.843245
std,33.127004,343.50408,296.131975
min,1870.0,1.0,1.0
25%,1932.0,1680.0,1735.0
50%,1960.0,1822.0,1870.0
75%,1980.0,1900.0,1916.0
max,2020.0,5000.0,15335.0


## Convert boolean columns to integer

In [5]:
full_df["Is Public Domain"] = full_df["Is Public Domain"]*1

In [6]:
full_df.head()

Unnamed: 0,Is Highlight,Is Timeline Work,Is Public Domain,Department,AccessionYear,Object Begin Date,Object End Date
0,False,False,1,Greek and Roman Art,2011,1,100
1,False,False,1,Greek and Roman Art,2011,1,100
2,False,False,1,Greek and Roman Art,2011,1,100
3,False,False,1,Greek and Roman Art,2011,1,200
4,False,False,1,Asian Art,2010,1,99


In [7]:
full_df["Is Public Domain"].value_counts()

0    211100
1    191509
Name: Is Public Domain, dtype: int64

In [8]:
full_df["Is Highlight"] = full_df["Is Highlight"]*1

In [9]:
full_df["Is Highlight"].value_counts()

0    400772
1      1837
Name: Is Highlight, dtype: int64

In [11]:
full_df["Is Timeline Work"] = full_df["Is Timeline Work"]*1

In [12]:
full_df["Is Timeline Work"].value_counts()

0    395751
1      6858
Name: Is Timeline Work, dtype: int64

## Convert Deptartments (object) columns to integer

In [13]:
full_df["Department"] = full_df["Department"].map({"Ancient Near Eastern Art": 1,\
                                                     "Arms and Armor": 2,\
                                                     "Arts of Africa, Oceania, and the Americas": 3,\
                                                     "Asian Art": 4,\
                                                     "Costume Institute" : 5,\
                                                     "Drawings and Prints": 6,\
                                                     "Egyptian Art": 7,\
                                                     "European Paintings": 8,\
                                                     "European Sculpture and Decorative Arts": 9,\
                                                     "Greek and Roman Art": 10,\
                                                     "Islamic Art": 11,\
                                                     "Medieval Art": 12,\
                                                     "Modern and Contemporary Art": 13,\
                                                     "Musical Instruments": 14,\
                                                     "Photographs": 15,\
                                                     "Robert Lehman Collection": 16,\
                                                     "The American Wing": 17,\
                                                     "The Cloisters": 18,\
                                                     "The Libraries": 19})

In [14]:
full_df.head()

Unnamed: 0,Is Highlight,Is Timeline Work,Is Public Domain,Department,AccessionYear,Object Begin Date,Object End Date
0,0,0,1,10,2011,1,100
1,0,0,1,10,2011,1,100
2,0,0,1,10,2011,1,100
3,0,0,1,10,2011,1,200
4,0,0,1,4,2010,1,99


In [16]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402609 entries, 0 to 402608
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   Is Highlight       402609 non-null  int32
 1   Is Timeline Work   402609 non-null  int32
 2   Is Public Domain   402609 non-null  int32
 3   Department         402609 non-null  int64
 4   AccessionYear      402609 non-null  int64
 5   Object Begin Date  402609 non-null  int64
 6   Object End Date    402609 non-null  int64
dtypes: int32(3), int64(4)
memory usage: 16.9 MB


## Randomly select rows of dataframe for training

In [17]:
sample_df= full_df.sample(n=200000)

In [18]:
#train_df= sample_df.head(200000)

In [19]:
train_df= sample_df
train_df.head()

Unnamed: 0,Is Highlight,Is Timeline Work,Is Public Domain,Department,AccessionYear,Object Begin Date,Object End Date
314015,0,0,1,13,1984,1910,1910
75025,0,0,1,9,1908,1600,1615
127171,0,0,1,9,1976,1728,1730
223194,0,0,0,6,1962,1850,1899
401367,0,0,0,6,2014,2007,2007


In [20]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 314015 to 263195
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   Is Highlight       200000 non-null  int32
 1   Is Timeline Work   200000 non-null  int32
 2   Is Public Domain   200000 non-null  int32
 3   Department         200000 non-null  int64
 4   AccessionYear      200000 non-null  int64
 5   Object Begin Date  200000 non-null  int64
 6   Object End Date    200000 non-null  int64
dtypes: int32(3), int64(4)
memory usage: 9.9 MB


## Build Machine Learning Model- Random Forest

In [21]:
target = train_df["Is Public Domain"]
target_names= ["yes", "no"]

In [22]:
data = train_df.drop("Is Public Domain", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,Is Highlight,Is Timeline Work,Department,AccessionYear,Object Begin Date,Object End Date
314015,0,0,13,1984,1910,1910
75025,0,0,9,1908,1600,1615
127171,0,0,9,1976,1728,1730
223194,0,0,6,1962,1850,1899
401367,0,0,6,2014,2007,2007


In [23]:
data.head()

Unnamed: 0,Is Highlight,Is Timeline Work,Department,AccessionYear,Object Begin Date,Object End Date
314015,0,0,13,1984,1910,1910
75025,0,0,9,1908,1600,1615
127171,0,0,9,1976,1728,1730
223194,0,0,6,1962,1850,1899
401367,0,0,6,2014,2007,2007


In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_df, target, random_state=42)

In [25]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

1.0

In [26]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=5000)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

1.0

In [27]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.7494752120743918, 'Department'),
 (0.08977960301970561, 'Object End Date'),
 (0.01910782389068243, 'AccessionYear'),
 (0.018708212783587825, 'Object Begin Date'),
 (0.0007544000838807419, 'Is Timeline Work'),
 (2.1622123907764468e-05, 'Is Highlight')]

In [28]:
df= {"Features": ["Department", "Object End Date", "Accession Year", "Object Begin Date", "Is Timeline Work", "Is Highlight"],\
     "Importances": [0.7494752120743918, 0.08977960301970561, 0.01910782389068243, 0.018708212783587825, 0.0007544000838807419, 2.1622123907764468e-05]}

In [30]:
df= pd.DataFrame(data=df)
df

Unnamed: 0,Features,Importances
0,Department,0.749475
1,Object End Date,0.08978
2,Accession Year,0.019108
3,Object Begin Date,0.018708
4,Is Timeline Work,0.000754
5,Is Highlight,2.2e-05
