In [1]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
import pandas as pd
import spacy

In [2]:
# Read the combined scrape output from CSV into the working DataFrame `df`.
df = pd.read_csv("final_scraped_data.csv")

In [3]:
# (rows, columns) of the freshly loaded table.
df.shape

(22750, 31)

The path column is inconsistent. Let's fix that

In [4]:
# Inspect the raw image paths before normalizing them.
df['image_path']

0                          images\10005.jpg
1                          images\10006.jpg
2                          images\10007.jpg
3                          images\10115.jpg
4                          images\10121.jpg
                        ...                
22745    /collections/assets/199240_800.jpg
22746    /collections/assets/569000_800.jpg
22747    /collections/assets/568903_800.jpg
22748    /collections/assets/142255_800.jpg
22749    /collections/assets/142238_800.jpg
Name: image_path, Length: 22750, dtype: object

In [5]:
# Function to normalize the paths
def normalize_image_path(path):
    """Return `path` rewritten to the canonical ``images/<file name>`` form.

    Backslashes are converted to forward slashes. A path that does not already
    start with ``images/`` is reduced to its last ``/``-separated component and
    re-rooted under ``images/``.

    Parameters
    ----------
    path : str or any
        One cell of the ``image_path`` column.

    Returns
    -------
    str or any
        The normalized path. Values that are not strings (for example the
        float NaN pandas uses for an empty CSV cell, or None) are returned
        unchanged instead of raising ``AttributeError``.
    """
    # Missing cells are not strings and have nothing to normalize.
    if not isinstance(path, str):
        return path

    # Replace backslashes with forward slashes
    path = path.replace("\\", "/")

    # Ensure the path starts with 'images/'
    if not path.startswith("images/"):
        # Keep only the file name (last part of the path) and re-root it
        image_name = path.split("/")[-1]
        path = f"images/{image_name}"

    return path

# Rewrite every entry of 'image_path' through normalize_image_path
df['image_path'] = df['image_path'].map(normalize_image_path)

In [6]:
# Re-display the column to check the normalized paths.
df['image_path']

0             images/10005.jpg
1             images/10006.jpg
2             images/10007.jpg
3             images/10115.jpg
4             images/10121.jpg
                 ...          
22745    images/199240_800.jpg
22746    images/569000_800.jpg
22747    images/568903_800.jpg
22748    images/142255_800.jpg
22749    images/142238_800.jpg
Name: image_path, Length: 22750, dtype: object

The name (object title) is an important attribute, and during web scraping we couldn't get it for some artifacts.
Drop the rows where `name` is null.

In [7]:
# Number of rows whose 'name' is missing.
df['name'].isna().sum()

70

In [8]:
# Discard every row that has no 'name'
df = df.dropna(axis=0, how='any', subset=['name'])

In [9]:
# Check that no missing 'name' values remain after the drop.
df['name'].isna().sum()

0

Some artifacts didn't have very specific object titles but a general name. For each artifact that has an object title, we keep the object title, else we replace the null values with the general name of the object

In [10]:
# Keep the specific title where present; otherwise fall back to the general 'name'
object_titles = df['Object Title']
df['Object Title'] = object_titles.where(object_titles.notna(), df['name'])

In [11]:
# Check that 'Object Title' has no missing values after the fill.
df['Object Title'].isna().sum()

0

The column 'name' is now redundant

In [12]:
# 'name' has been folded into 'Object Title', so remove it
df = df.drop(columns=['name'])

Some other columns are also irrelevant for our project

In [13]:
# Location, dimension and secondary-number columns are not used in this project
unused_columns = [
    'Current Location', 'Depth', 'Height', 'Length',
    'Other Number', 'Outside Diameter', 'Thickness', 'Width',
]
df = df.drop(columns=unused_columns)

In [14]:
# (rows, columns) after dropping the nameless rows and the unused columns.
df.shape

(22680, 22)

Now using NLP, we can extract some information from the description.

In [15]:
# Trim surrounding whitespace, then treat blank descriptions the same as missing
# ones so both receive the placeholder text (a whitespace-only cell strips to ''
# and would otherwise slip past fillna).
stripped_description = df['Description'].str.strip()
df['Description'] = (
    stripped_description
    .mask(stripped_description == '')
    .fillna('No description available')
)

In [16]:
# spaCy pipeline used by extract_entities
nlp = spacy.load('en_core_web_sm')

# Independent working table: the object identifier plus the description text,
# with the description column cast to str
df_nlp = df.loc[:, ['Object Number', 'Description']].astype({'Description': str})

def extract_entities(description, nlp_model=None):
    """Map the named entities found in `description` onto artifact fields.

    Parameters
    ----------
    description : str
        Free-text description of one artifact.
    nlp_model : callable, optional
        Pipeline to run on the text. It must return an object whose ``ents``
        items expose ``label_`` and ``text``. Defaults to the notebook-level
        ``nlp`` pipeline, so existing one-argument calls behave as before.

    Returns
    -------
    dict
        Keys 'Culture d', 'Date Made d', 'Iconography d', 'Materials d',
        'Period d', 'Provenience d' and 'Technique d'. Each value is the text
        of the last matching entity in the document, or None if none matched.
    """
    model = nlp if nlp_model is None else nlp_model
    doc = model(description)

    # Every field starts as None and stays None when no entity maps to it
    entities = {
        'Culture d': None,
        'Date Made d': None,
        'Iconography d': None,
        'Materials d': None,
        'Period d': None,
        'Provenience d': None,
        'Technique d': None
    }

    # Entity label -> output field. 'MATERIAL' feeds only 'Materials d': the
    # former second test for it in the Technique branch could never be reached.
    # NOTE(review): in the recorded run 'Culture d', 'Materials d' and
    # 'Technique d' are null for every row, so MATERIAL / CULTURE / SKILL
    # presumably are not labels this model emits - confirm against the model's
    # label set.
    # NOTE(review): CARDINAL (any bare number) is treated as a date - confirm
    # this is intended.
    label_to_field = {
        'MATERIAL': 'Materials d',
        'CULTURE': 'Culture d',
        'DATE': 'Date Made d',
        'CARDINAL': 'Date Made d',
        'ORG': 'Iconography d',
        'GPE': 'Provenience d',
        'TIME': 'Period d',
        'SKILL': 'Technique d',
    }

    # Later entities overwrite earlier ones that map to the same field
    for ent in doc.ents:
        field = label_to_field.get(ent.label_)
        if field is not None:
            entities[field] = ent.text

    return entities

# Run entity extraction on every description (one dict per row)
df_nlp['Entities'] = df_nlp['Description'].apply(extract_entities)

# Expand the 'Entities' dictionaries into separate columns.
# The key list is read from the first row by position (.iloc[0]); a plain [0]
# is a label lookup and raises KeyError when the row labelled 0 was removed by
# an earlier row drop.
for col in df_nlp['Entities'].iloc[0].keys():
    df_nlp[col] = df_nlp['Entities'].apply(lambda x: x[col])

# The dictionaries are no longer needed once expanded
df_nlp.drop(columns=['Entities'], inplace=True)

# Display the updated DataFrame for the first 5 rows
print(df_nlp.head(5))

# Save the updated DataFrame to a new CSV file
df_nlp.to_csv("info_from_description.csv", index=False)

  Object Number                                        Description Culture d  \
0         10005           Showing artificial and natural fracture.      None   
1         10006                           No description available      None   
2         10007                           No description available      None   
3         10115  The fragment of pottery is unusually thick, an...      None   
4         10121                                         Spade like      None   

  Date Made d Iconography d Materials d Period d Provenience d Technique d  
0        None          None        None     None          None        None  
1        None          None        None     None          None        None  
2        None          None        None     None          None        None  
3        None          None        None     None     Tennessee        None  
4        None          None        None     None          None        None  


This information is stored in "info_from_description.csv"

Let's see how much valuable information this DataFrame has

In [17]:
# (rows, columns) of the extracted-entity table.
df_nlp.shape

(22680, 9)

In [18]:
# Per-column count of missing values in the extracted-entity table
null_values = df_nlp.isnull().sum()
print(null_values)

Object Number        0
Description          0
Culture d        22680
Date Made d      14496
Iconography d    16970
Materials d      22680
Period d         22657
Provenience d    19605
Technique d      22680
dtype: int64


We can see that Date Made, Iconography, Period and Provenience are the only columns that have at least some not null values that might be helpful

In [19]:
# These three extracted columns contain no values at all
df_nlp = df_nlp.drop(columns=['Culture d', 'Materials d', 'Technique d'])

Now let's concatenate this with the data that we have and merge the dataframes

In [20]:
# (rows, columns) of the main table, to compare its row count with df_nlp.
df.shape

(22680, 22)

They have the same number of rows and, as we didn't manipulate the rows, we can simply concatenate the columns

In [21]:
# Place the extracted-entity columns to the right of the main table (index-aligned)
df = pd.concat(objs=[df, df_nlp], axis='columns')

In [22]:
# List the column labels after the concat; repeated labels show up here.
df.columns

Index(['Object Number', 'image_path', 'Archaeology Area', 'Creator',
       'Credit Line', 'Culture Area', 'Culture', 'Date Made', 'Description',
       'Iconography', 'Inscription Language', 'Locus', 'Manufacture Location',
       'Materials', 'Native Name', 'Object Title', 'Period', 'Provenience',
       'Section', 'Site Name', 'Subject', 'Technique', 'Object Number',
       'Description', 'Date Made d', 'Iconography d', 'Period d',
       'Provenience d'],
      dtype='object')

Object Number and Description are redundant so we remove that and save the dataframe as csv to not lose our work

In [23]:
# True for every column label that repeats an earlier one
is_repeated_label = df.columns.duplicated()

# Remember which labels are about to be removed
duplicate_columns = df.columns[is_repeated_label]

# Keep only the first occurrence of each label
df = df.loc[:, ~is_repeated_label]

# Report what was removed
print(f"Dropped columns: {duplicate_columns}")

Dropped columns: Index(['Object Number', 'Description'], dtype='object')


In [24]:
# Checkpoint the merged table to disk, without the row index.
df.to_csv("Concatinated with NLP columns.csv", index=False)

# Missing Data

Reload the saved checkpoint so the missing-data work can be restarted from this point

In [146]:
# Read the checkpoint CSV back into `df`; this replaces the in-memory frame.
df = pd.read_csv("Concatinated with NLP columns.csv")

In [147]:
# Missing-value count for every column of the reloaded table.
df.isna().sum()

Object Number               0
image_path                  0
Archaeology Area        19504
Creator                 22049
Credit Line               559
Culture Area            14991
Culture                 12188
Date Made               14992
Description                 0
Iconography             18967
Inscription Language    21385
Locus                   17677
Manufacture Location    21713
Materials                 288
Native Name             21827
Object Title                0
Period                  15622
Provenience               149
Section                     0
Site Name               22140
Subject                 22667
Technique               18097
Date Made d             14496
Iconography d           16970
Period d                22657
Provenience d           19605
dtype: int64

Getting rid of columns that have mostly null values and are not as relevant

In [148]:
# Columns judged not relevant enough to keep
sparse_columns = [
    'Creator', 'Credit Line', 'Inscription Language', 'Manufacture Location',
    'Native Name', 'Site Name', 'Subject',
]
df = df.drop(columns=sparse_columns)

In [149]:
# (rows, columns) after removing the sparse columns.
df.shape

(22680, 19)

Fill null values in our original columns from the data we got from the description

In [150]:
# Back-fill each original column from its description-derived '<column> d' twin,
# touching only the cells that are currently null
for column in ('Date Made', 'Iconography', 'Period', 'Provenience'):
    df[column] = df[column].fillna(df[f'{column} d'])

In [151]:
# Missing-value count per column after the back-fill from the ' d' columns.
df.isna().sum()

Object Number           0
image_path              0
Archaeology Area    19504
Culture Area        14991
Culture             12188
Date Made           10422
Description             0
Iconography         14782
Locus               17677
Materials             288
Object Title            0
Period              15604
Provenience           141
Section                 0
Technique           18097
Date Made d         14496
Iconography d       16970
Period d            22657
Provenience d       19605
dtype: int64

In [152]:
# The description-derived helper columns have been merged in; remove them
df = df.drop(columns=['Date Made d', 'Iconography d', 'Period d', 'Provenience d'])

Dropping some more columns that have a lot of null values and it is hard to fill them accurately. Plus they are not very useful for taxonomy

In [153]:
# Remove three more columns that are mostly empty
df = df.drop(columns=['Archaeology Area', 'Locus', 'Technique'])

And let's drop some rows that have a lot of null values

In [154]:
excluded_columns = ['Object Number', 'image_path', 'Description', 'Object Title', 'Section']

# A row is flagged when none of its non-excluded columns holds a value
mask = ~df.drop(columns=excluded_columns).notna().any(axis=1)

# Pull out the flagged rows
rows_with_nulls_except_specified = df.loc[mask]

# Report how many rows were flagged
print("\nRows where all columns except are null:")
print(len(rows_with_nulls_except_specified))


Rows where all columns except are null:
0


In [155]:
# Missing-value count per column for the remaining columns.
df.isna().sum()

Object Number        0
image_path           0
Culture Area     14991
Culture          12188
Date Made        10422
Description          0
Iconography      14782
Materials          288
Object Title         0
Period           15604
Provenience        141
Section              0
dtype: int64

In [156]:
excluded_columns = ['Object Number', 'image_path', 'Description', 'Object Title', 'Section', 'Provenience']

# A row is flagged when none of its non-excluded columns holds a value
mask = ~df.drop(columns=excluded_columns).notna().any(axis=1)

# Pull out the flagged rows
rows_with_nulls_except_specified = df.loc[mask]

# Report how many rows were flagged
print("\nRows where all columns except are null:")
print(len(rows_with_nulls_except_specified))


Rows where all columns except are null:
25


We can lose 25 data points, no problem

In [157]:
# Remove the rows flagged by `mask`, then show the remaining row count
df = df.drop(index=mask[mask].index)
len(df)

22655

In [158]:
excluded_columns = ['Object Number', 'image_path', 'Description', 'Object Title', 'Section', 'Provenience', 'Materials']

# A row is flagged when none of its non-excluded columns holds a value
mask = ~df.drop(columns=excluded_columns).notna().any(axis=1)

# Pull out the flagged rows
rows_with_nulls_except_specified = df.loc[mask]

# Report how many rows were flagged
print("\nRows where all columns except are null:")
print(len(rows_with_nulls_except_specified))


Rows where all columns except are null:
2023


We can lose 2023 data points

In [159]:
# Remove the rows flagged by `mask`, then show the remaining row count
df = df.drop(index=mask[mask].index)
len(df)

20632

# Using mode to fill in the null values

In [160]:
# Distinct-value count for every column, smallest first
unique_values = df.nunique().sort_values(ascending=True)
print(unique_values)

Section             10
Culture Area        59
Period             856
Culture           1213
Materials         2448
Date Made         2582
Provenience       2596
Object Title      2803
Iconography       3304
Description      16957
image_path       20516
Object Number    20522
dtype: int64


Provenience is most commonly related to

Section: This might provide geographical context within a broader archaeological site.

In [161]:
# Step 1: most frequent 'Provenience' within each 'Section'.
# Series.mode() ignores nulls; .get(0) returns its first entry, or None when
# the group has no non-null value, so the mode is computed once per group.
provenience_mode = df.groupby('Section')['Provenience'].agg(lambda x: x.mode().get(0))

# Step 2: fill only the missing 'Provenience' cells with their Section's mode.
# fillna + map replaces the row-by-row df.apply; rows whose Section has no
# mode stay null.
df['Provenience'] = df['Provenience'].fillna(df['Section'].map(provenience_mode))

# Check how many null values remain in 'Provenience' after filling
print("\nNumber of remaining null values in 'Provenience':", df['Provenience'].isna().sum())


Number of remaining null values in 'Provenience': 0


Now let's look at materials

Material could be dependent on culture and period

Culture: The type of material used could be strongly related to the culture. For example, different cultures may have preferred materials based on their environment, technological capabilities, or artistic traditions. For instance, the Totonac culture might be associated with specific materials like Lava Stone or other local materials.

Period: The time period in which the artifact was made could also influence the material used. For example, certain materials might have been more common in specific time periods (e.g., Lava Stone in the Late Classic period).

In [162]:
# Step 1: most frequent 'Materials' for each ('Period', 'Culture') pair.
# Series.mode() ignores nulls; .get(0) returns its first entry, or None when
# the group has no non-null value, so the mode is computed once per group.
materials_mode = df.groupby(['Period', 'Culture'])['Materials'].agg(lambda x: x.mode().get(0))

# Step 2: look up every row's (Period, Culture) pair in one pass and fill only
# the missing 'Materials' cells. Pairs absent from the lookup (including any
# pair with a null key, which groupby leaves out) yield None and stay null.
mode_by_group = materials_mode.to_dict()
group_fill = pd.Series(
    [mode_by_group.get(key) for key in zip(df['Period'], df['Culture'])],
    index=df.index,
    dtype=object,
)
df['Materials'] = df['Materials'].fillna(group_fill)

# Check how many null values remain in 'Materials' after filling
print("\nNumber of remaining null values in 'Materials':", df['Materials'].isna().sum())


Number of remaining null values in 'Materials': 229


In [163]:
# Step 1: most frequent 'Materials' within each 'Provenience'.
# This assumes that 'Materials' might be related to 'Provenience'.
# Series.mode() ignores nulls; .get(0) returns its first entry, or None when
# the group has no non-null value, so the mode is computed once per group.
materials_mode = df.groupby('Provenience')['Materials'].agg(lambda x: x.mode().get(0))

# Step 2: fill only the missing 'Materials' cells with their Provenience's mode.
# fillna + map replaces the row-by-row df.apply; a null 'Provenience' maps to
# null, so those rows are left untouched.
df['Materials'] = df['Materials'].fillna(df['Provenience'].map(materials_mode))

# Check how many null values remain in 'Materials' after filling
print("\nNumber of remaining null values in 'Materials':", df['Materials'].isna().sum())


Number of remaining null values in 'Materials': 41


In [164]:
# Step 1: most frequent 'Materials' within each 'Date Made'.
# This assumes that 'Materials' might be related to 'Date Made'.
# Series.mode() ignores nulls; .get(0) returns its first entry, or None when
# the group has no non-null value, so the mode is computed once per group.
materials_mode = df.groupby('Date Made')['Materials'].agg(lambda x: x.mode().get(0))

# Step 2: fill only the missing 'Materials' cells with their Date Made's mode.
# fillna + map replaces the row-by-row df.apply; a null 'Date Made' maps to
# null, so those rows are left untouched.
df['Materials'] = df['Materials'].fillna(df['Date Made'].map(materials_mode))

# Check how many null values remain in 'Materials' after filling
print("\nNumber of remaining null values in 'Materials':", df['Materials'].isna().sum())


Number of remaining null values in 'Materials': 26


In [165]:
# Step 1: most frequent 'Materials' within each 'Iconography'.
# This assumes that 'Materials' might be related to 'Iconography'.
# Series.mode() ignores nulls; .get(0) returns its first entry, or None when
# the group has no non-null value, so the mode is computed once per group.
materials_mode = df.groupby('Iconography')['Materials'].agg(lambda x: x.mode().get(0))

# Step 2: fill only the missing 'Materials' cells with their Iconography's mode.
# fillna + map replaces the row-by-row df.apply; a null 'Iconography' maps to
# null, so those rows are left untouched.
df['Materials'] = df['Materials'].fillna(df['Iconography'].map(materials_mode))

# Check how many null values remain in 'Materials' after filling
print("\nNumber of remaining null values in 'Materials':", df['Materials'].isna().sum())


Number of remaining null values in 'Materials': 26


Seems like this is the best we can do. Let's make another category called unknown

In [166]:
# Label the remaining gaps in 'Materials' with an explicit 'Unknown' category
materials = df['Materials']
df['Materials'] = materials.where(materials.notna(), 'Unknown')

# Confirm that nothing is left unfilled
print("\nNumber of remaining null values in 'Materials':", df['Materials'].isna().sum())


Number of remaining null values in 'Materials': 0


In [167]:
# True for rows that have a value in every column of df
non_null_rows = ~df.isna().any(axis=1)

# The fully populated rows themselves
rows_with_no_nulls = df.loc[non_null_rows]

# Report how many rows are fully populated
print(f"Number of rows where nothing is null: {rows_with_no_nulls.shape[0]}")

Number of rows where nothing is null: 53


In [168]:
# Missing-value count per column after 'Provenience' and 'Materials' were filled.
df.isna().sum()

Object Number        0
image_path           0
Culture Area     12943
Culture          10140
Date Made         8374
Description          0
Iconography      12734
Materials            0
Object Title         0
Period           13556
Provenience          0
Section              0
dtype: int64

Now let's try to fill in Date Made

We can assume that Date Made is very much related to Period

In [170]:
# Step 1: most frequent 'Date Made' within each 'Period'.
# Series.mode() ignores nulls; .get(0) returns its first entry, or None when
# the group has no non-null value, so the mode is computed once per group.
date_made_mode = df.groupby('Period')['Date Made'].agg(lambda x: x.mode().get(0))

# Step 2: fill only the missing 'Date Made' cells with their Period's mode.
# fillna + map replaces the row-by-row df.apply; a null 'Period' maps to null,
# so those rows are left untouched.
df['Date Made'] = df['Date Made'].fillna(df['Period'].map(date_made_mode))

# Check how many null values remain in 'Date Made' after filling
print("\nNumber of remaining null values in 'Date Made':", df['Date Made'].isna().sum())


Number of remaining null values in 'Date Made': 7108


In [171]:
# Step 1: most frequent 'Date Made' for each ('Culture Area', 'Culture') pair.
# Series.mode() ignores nulls; .get(0) returns its first entry, or None when
# the group has no non-null value, so the mode is computed once per group.
date_made_mode = df.groupby(['Culture Area', 'Culture'])['Date Made'].agg(lambda x: x.mode().get(0))

# Step 2: look up every row's (Culture Area, Culture) pair in one pass and fill
# only the missing 'Date Made' cells. Pairs absent from the lookup (including
# any pair with a null key, which groupby leaves out) yield None and stay null.
mode_by_group = date_made_mode.to_dict()
group_fill = pd.Series(
    [mode_by_group.get(key) for key in zip(df['Culture Area'], df['Culture'])],
    index=df.index,
    dtype=object,
)
df['Date Made'] = df['Date Made'].fillna(group_fill)

# Check how many null values remain in 'Date Made' after filling
print("\nNumber of remaining null values in 'Date Made':", df['Date Made'].isna().sum())


Number of remaining null values in 'Date Made': 5211


In [172]:
# True for rows that have a value in every column of df
non_null_rows = ~df.isna().any(axis=1)

# The fully populated rows themselves
rows_with_no_nulls = df.loc[non_null_rows]

# Report how many rows are fully populated
print(f"Number of rows where nothing is null: {rows_with_no_nulls.shape[0]}")

Number of rows where nothing is null: 97


In [173]:
# Missing-value count per column after the 'Date Made' fills.
df.isna().sum()

Object Number        0
image_path           0
Culture Area     12943
Culture          10140
Date Made         5211
Description          0
Iconography      12734
Materials            0
Object Title         0
Period           13556
Provenience          0
Section              0
dtype: int64

Now let's look at Period null values

In [174]:
# Step 1: most frequent 'Period' within each 'Date Made'.
# Series.mode() ignores nulls; .get(0) returns its first entry, or None when
# the group has no non-null value, so the mode is computed once per group.
period_mode = df.groupby('Date Made')['Period'].agg(lambda x: x.mode().get(0))

# Step 2: fill only the missing 'Period' cells with their Date Made's mode.
# fillna + map replaces the row-by-row df.apply; a null 'Date Made' maps to
# null, so those rows are left untouched.
df['Period'] = df['Period'].fillna(df['Date Made'].map(period_mode))

# Check how many null values remain in 'Period' after filling
print("\nNumber of remaining null values in 'Period':", df['Period'].isna().sum())


Number of remaining null values in 'Period': 8016


In [175]:
# Step 1: most frequent 'Period' for each ('Culture Area', 'Culture') pair.
# Series.mode() ignores nulls; .get(0) returns its first entry, or None when
# the group has no non-null value, so the mode is computed once per group.
period_mode = df.groupby(['Culture Area', 'Culture'])['Period'].agg(lambda x: x.mode().get(0))

# Step 2: look up every row's (Culture Area, Culture) pair in one pass and fill
# only the missing 'Period' cells. Pairs absent from the lookup (including any
# pair with a null key, which groupby leaves out) yield None and stay null.
mode_by_group = period_mode.to_dict()
group_fill = pd.Series(
    [mode_by_group.get(key) for key in zip(df['Culture Area'], df['Culture'])],
    index=df.index,
    dtype=object,
)
df['Period'] = df['Period'].fillna(group_fill)

# Check how many null values remain in 'Period' after filling
print("\nNumber of remaining null values in 'Period':", df['Period'].isna().sum())


Number of remaining null values in 'Period': 7506


In [177]:
# Missing-value count per column after the 'Period' fills.
df.isna().sum()

Object Number        0
image_path           0
Culture Area     12943
Culture          10140
Date Made         5211
Description          0
Iconography      12734
Materials            0
Object Title         0
Period            7506
Provenience          0
Section              0
dtype: int64

Let's look at culture and culture areas now

In [178]:
# Step 1: most frequent 'Culture Area' within each 'Culture'.
# Series.mode() ignores nulls; .get(0) returns its first entry, or None when
# the group has no non-null value, so the mode is computed once per group.
culture_area_mode = df.groupby('Culture')['Culture Area'].agg(lambda x: x.mode().get(0))

# Step 2: fill only the missing 'Culture Area' cells with their Culture's mode.
# fillna + map replaces the row-by-row df.apply; a null 'Culture' maps to null,
# so those rows are left untouched.
df['Culture Area'] = df['Culture Area'].fillna(df['Culture'].map(culture_area_mode))

# Check how many null values remain in 'Culture Area' after filling
print("\nNumber of remaining null values in 'Culture Area':", df['Culture Area'].isna().sum())


Number of remaining null values in 'Culture Area': 12560


In [179]:
# Step 1: most frequent 'Culture' within each 'Culture Area'.
# Series.mode() ignores nulls; .get(0) returns its first entry, or None when
# the group has no non-null value, so the mode is computed once per group.
culture_mode = df.groupby('Culture Area')['Culture'].agg(lambda x: x.mode().get(0))

# Step 2: fill only the missing 'Culture' cells with their Culture Area's mode.
# fillna + map replaces the row-by-row df.apply; a null 'Culture Area' maps to
# null, so those rows are left untouched.
df['Culture'] = df['Culture'].fillna(df['Culture Area'].map(culture_mode))

# Check how many null values remain in 'Culture' after filling
print("\nNumber of remaining null values in 'Culture':", df['Culture'].isna().sum())


Number of remaining null values in 'Culture': 6455


In [180]:
# True for rows that have a value in every column of df
non_null_rows = ~df.isna().any(axis=1)

# The fully populated rows themselves
rows_with_no_nulls = df.loc[non_null_rows]

# Report how many rows are fully populated
print(f"Number of rows where nothing is null: {rows_with_no_nulls.shape[0]}")

Number of rows where nothing is null: 951


Let's try to fill in iconography

In [181]:
# Step 1: most frequent 'Iconography' for each ('Culture', 'Period') pair.
# Series.mode() ignores nulls; .get(0) returns its first entry, or None when
# the group has no non-null value, so the mode is computed once per group.
iconography_mode = df.groupby(['Culture', 'Period'])['Iconography'].agg(lambda x: x.mode().get(0))

# Step 2: look up every row's (Culture, Period) pair in one pass and fill only
# the missing 'Iconography' cells. Pairs absent from the lookup (including any
# pair with a null key, which groupby leaves out) yield None and stay null.
mode_by_group = iconography_mode.to_dict()
group_fill = pd.Series(
    [mode_by_group.get(key) for key in zip(df['Culture'], df['Period'])],
    index=df.index,
    dtype=object,
)
df['Iconography'] = df['Iconography'].fillna(group_fill)

# Check how many null values remain in 'Iconography' after filling
print("\nNumber of remaining null values in 'Iconography':", df['Iconography'].isna().sum())


Number of remaining null values in 'Iconography': 9364


We still have a lot of null values for iconography, let's drop and see how many rows are full

In [182]:
# 'Iconography' is still mostly empty, so leave it out of the final table
df = df.drop(columns=['Iconography'])

In [183]:
# True for rows that have a value in every column of df
non_null_rows = ~df.isna().any(axis=1)

# The fully populated rows themselves
rows_with_no_nulls = df.loc[non_null_rows]

# Report how many rows are fully populated
print(f"Number of rows where nothing is null: {rows_with_no_nulls.shape[0]}")

Number of rows where nothing is null: 4143


In [184]:
# (rows, columns) of the fully populated subset.
rows_with_no_nulls.shape

(4143, 11)

In [186]:
# Sanity check: every column of the subset should report zero missing values.
rows_with_no_nulls.isna().sum()

Object Number    0
image_path       0
Culture Area     0
Culture          0
Date Made        0
Description      0
Materials        0
Object Title     0
Period           0
Provenience      0
Section          0
dtype: int64

In [199]:
# Distinct-value count per column of the subset, smallest first.
rows_with_no_nulls.nunique().sort_values()

Section             5
Culture Area       36
Period            117
Culture           291
Date Made         373
Provenience       927
Object Title      950
Materials        1073
Description      3730
image_path       4121
Object Number    4122
dtype: int64

In [195]:
# Keep the first row for each distinct 'image_path' and discard later repeats
repeated_path = rows_with_no_nulls.duplicated(subset='image_path', keep='first')
final_df = rows_with_no_nulls.loc[~repeated_path]

# Report how many rows survive
print(f"Number of rows after removing duplicates: {len(final_df)}")

Number of rows after removing duplicates: 4121


In [197]:
# Distinct-value count per column of the de-duplicated table, smallest first.
final_df.nunique().sort_values()

Section             5
Culture Area       36
Period            117
Culture           291
Date Made         372
Provenience       927
Object Title      948
Materials        1070
Description      3721
Object Number    4100
image_path       4121
dtype: int64

In [196]:
# Write the cleaned table without the row index, matching the other CSV
# exports in this notebook. The index here is only the leftover row labels
# from before filtering and would otherwise be saved as an extra unnamed column.
final_df.to_csv("Cleaned.csv", index=False)