In [526]:
import pandas as pd

In [527]:
df = pd.read_csv("Cleaned.csv")

In [528]:
df.nunique().sort_values()

Section             5
Culture Area       36
Period            117
Culture           291
Date Made         372
Provenience       927
Object Title      948
Materials        1070
Description      3721
Object Number    4100
Unnamed: 0       4121
image_path       4121
dtype: int64

In [529]:
df.shape

(4121, 12)

Some columns are not relevant for creating a taxonomy

In [530]:
df.drop(columns=['image_path', 'Unnamed: 0', 'Object Number', 'Description', 'Object Title', 'Date Made'], inplace=True)

In [531]:
df.nunique().sort_values()

Section            5
Culture Area      36
Period           117
Culture          291
Provenience      927
Materials       1070
dtype: int64

### Section

In [532]:
pd.set_option('display.max_rows', None)

In [533]:
df['Section'].value_counts()

Section
American        3936
African          124
Asian             45
European          15
Near Eastern       1
Name: count, dtype: int64

Data is too biased and won't do justice while visualization

In [534]:
df.drop(columns=['Section'], inplace=True)

### Culture Area

In [535]:
df['Culture Area'].value_counts()

Culture Area
Southwest Culture Area                                                     1440
Great Plains Culture Area                                                   601
Central American                                                            586
Northeast Culture Area                                                      372
Andean                                                                      232
Northwest Coast Culture Area                                                216
Southeast Culture Area                                                      141
California Culture Area                                                      84
Arctic Culture Area                                                          79
Subarctic Culture Area                                                       67
Great Basin Culture Area                                                     56
West Africa                                                                  50
Siberia                    

In [536]:
# Remove the words "Culture Area" from the values in the 'Culture Area' column
df['Culture Area'] = df['Culture Area'].str.replace('Culture Area', '', regex=False).str.strip()

In [537]:
df['Culture Area'].value_counts()

Culture Area
Southwest                                          1440
Great Plains                                        601
Central American                                    586
Northeast                                           372
Andean                                              232
Northwest Coast                                     216
Southeast                                           141
California                                           84
Arctic                                               79
Subarctic                                            67
Great Basin                                          56
West Africa                                          50
Siberia                                              38
Central Africa                                       28
Plateau                                              26
Amazonia                                             24
Ashanti                                              15
Great Plains  (uncertain)          

In [538]:
df['Culture Area'].value_counts().mean()

114.47222222222223

In [539]:
# Set the threshold for grouping 'Culture Area' as 'Other'
threshold = 60

# Get the value counts of 'Culture Area'
culture_area_counts = df['Culture Area'].value_counts()

# Step 1: Apply 'Other' to 'Culture Area' where the count is less than the threshold
df['Culture Area'] = df['Culture Area'].apply(
    lambda x: 'Other' if culture_area_counts[x] < threshold else x
)

In [540]:
df = df[df['Culture Area'] != 'Other']
print(df['Culture Area'].value_counts())

Culture Area
Southwest           1440
Great Plains         601
Central American     586
Northeast            372
Andean               232
Northwest Coast      216
Southeast            141
California            84
Arctic                79
Subarctic             67
Name: count, dtype: int64


In [541]:
df['Culture Area'].nunique()

10

In [542]:
df.shape

(3818, 5)

### Period

In [543]:
df['Period'].value_counts()

Period
Mousterian                                                                           1037
Roman Period                                                                          421
Iron Age | Hallstatt I | Bylany Phase                                                 395
Geometric Period                                                                      275
Birnirk                                                                               180
Cypro-Geometric IA                                                                    174
Bronze Age                                                                            170
Late Minoan I                                                                         166
Historic                                                                               78
Early Dynastic                                                                         75
Qing Dynasty | 20th Century                                                            65
Ach

In [544]:
df['Period'].value_counts().mean()

33.49122807017544

In [545]:
df['Period'] = df['Period'].str.split('|').str[0].str.strip()

In [546]:
# Set the threshold for grouping 'Culture Area' as 'Other'
threshold = 30

counts = df['Period'].value_counts()

# Step 1: Apply 'Other' to 'Culture Area' where the count is less than the threshold
df['Period'] = df['Period'].apply(
    lambda x: 'Other' if counts[x] < threshold else x
)

In [547]:
df = df[df['Period'] != 'Other']
print(df['Period'].value_counts())

Period
Mousterian            1037
Roman Period           421
Iron Age               396
Geometric Period       275
Birnirk                180
Cypro-Geometric IA     174
Bronze Age             170
Late Minoan I          166
Historic                83
Early Dynastic          75
Qing Dynasty            70
Achaemenid              63
Late Classic            61
Proto-Nasca             55
Hissar IB               52
Joseon Dynasty          42
Pueblo IV               36
Babylonian              35
Late Adena              34
Teotihuacan III         30
Name: count, dtype: int64


In [548]:
df['Period'].nunique()

20

In [549]:
df.shape

(3455, 5)

### Culture

In [550]:
df['Culture'].value_counts()

Culture
Cliff Dweller                                                             345
Woodland                                                                  203
Cocle                                                                     196
Sioux                                                                     187
Prehistoric Pueblo                                                        144
Inca                                                                      139
Anasazi                                                                   134
Hopi                                                                      121
Arapaho                                                                   102
Tlingit                                                                    92
Pueblo                                                                     84
Cherokee                                                                   77
Eskimo                                                  

In [551]:
df['Culture'] = df['Culture'].str.split('|').str[0].str.strip()

In [552]:
df['Culture'].value_counts().median()

5.0

In [553]:
# Set the threshold for grouping 'Culture Area' as 'Other'
threshold = 10

counts = df['Culture'].value_counts()

# Apply 'Other' to 'Culture Area' where the count is less than the threshold
df['Culture'] = df['Culture'].apply(
    lambda x: 'Other' if counts[x] < threshold else x
)
df['Culture'].value_counts()

Culture
Cliff Dweller            345
Other                    305
Pueblo                   217
Woodland                 204
Cocle                    196
Sioux                    188
Prehistoric Pueblo       144
Anasazi                  141
Inca                     139
Hopi                     121
Arapaho                  102
Tlingit                   96
Cherokee                  77
Eskimo                    72
Navajo                    68
Basket-Maker              65
Guatemalan                65
Pomo                      54
Crow (Culture)            51
Naskapi                   47
Apache                    41
Cakchiquel                40
Paracas                   38
Cliffdweller              36
Quiche                    35
Protohistoric Hopi        33
Huichol                   33
Maya                      26
Cheyenne (culture)        25
Plains Indian             24
Indian                    24
Ancestral Pueblo          23
Arapaho (uncertain)       21
Ojibwa                    21
Chilka

In [554]:
df = df[df['Culture'] != 'Other']
print(df['Culture'].value_counts())

Culture
Cliff Dweller            345
Pueblo                   217
Woodland                 204
Cocle                    196
Sioux                    188
Prehistoric Pueblo       144
Anasazi                  141
Inca                     139
Hopi                     121
Arapaho                  102
Tlingit                   96
Cherokee                  77
Eskimo                    72
Navajo                    68
Basket-Maker              65
Guatemalan                65
Pomo                      54
Crow (Culture)            51
Naskapi                   47
Apache                    41
Cakchiquel                40
Paracas                   38
Cliffdweller              36
Quiche                    35
Protohistoric Hopi        33
Huichol                   33
Maya                      26
Cheyenne (culture)        25
Plains Indian             24
Indian                    24
Ancestral Pueblo          23
Arapaho (uncertain)       21
Ojibwa                    21
Chilkat                   20
Hupa  

In [555]:
df['Culture'].nunique()

57

In [556]:
df.shape

(3150, 5)

### Provenience 

In [557]:
df['Provenience'].value_counts()

Provenience
Panama | Sitio Conte                                                                                                                                        124
Northeast Arizona                                                                                                                                           117
Peru | Pachacamac | Gravefield I                                                                                                                            105
Mexico (Central America)                                                                                                                                     79
Arizona                                                                                                                                                      62
Northeast Arizona | Little Colorado Area                                                                                                                     56
United States of America    

#### This can be divided into Country, Region and Site

In [558]:
# Extract 'Country', 'Region', and 'Site' from the 'Provenience' column
df[['Country', 'Region', 'Site']] = df['Provenience'].str.extract(r'([A-Za-z\s]+)\s?\|?\s?([A-Za-z\s]+)?\s?\|?\s?([A-Za-z\s]+)?')

# Strip any leading or trailing spaces from the extracted values
df['Country'] = df['Country'].str.strip()
df['Region'] = df['Region'].str.strip()
df['Site'] = df['Site'].str.strip()

Dealing with NaN values

In [559]:
# Fill NaN values with 'Unknown' for each of the new columns
df['Country'] = df['Country'].fillna('Unknown')
df['Region'] = df['Region'].fillna('Unknown')
df['Site'] = df['Site'].fillna('Unknown')

In [560]:
# Drop the 'Provenience' column as it is now redundant
df.drop(columns=['Provenience'], inplace=True)

In [561]:
df.nunique().sort_values()

Culture Area     10
Period           20
Country          51
Culture          57
Site            242
Region          264
Materials       770
dtype: int64

### Country

In [562]:
df['Country'].value_counts()

Country
United States of America    1013
Peru                         250
Northeast Arizona            211
New Mexico                   205
Arizona                      200
Guatemala                    198
Panama                       142
Mexico                       123
Alaska                       102
Canada                        92
Colorado                      84
Southeastern Utah             74
California                    59
North America                 58
Southwest Colorado            58
Great Plains                  51
Utah                          49
Montana                       42
Eastern Arizona               21
Costa Rica                    19
India                         11
Western Mexico                10
Washington                    10
Southwest United States        9
Greater Southwest              8
Dakotas                        7
Wooden                         6
Great Lakes                    5
Plains                         3
Oregon                         3
Ch

In [563]:
df['Country'].value_counts().mean()

61.76470588235294

In [564]:
# Set the threshold for grouping 'Culture Area' as 'Other'
threshold = 50

counts = df['Country'].value_counts()

# Apply 'Other' to 'Culture Area' where the count is less than the threshold
df['Country'] = df['Country'].apply(
    lambda x: 'Other' if counts[x] < threshold else x
)
df['Country'].value_counts()

Country
United States of America    1013
Peru                         250
Other                        230
Northeast Arizona            211
New Mexico                   205
Arizona                      200
Guatemala                    198
Panama                       142
Mexico                       123
Alaska                       102
Canada                        92
Colorado                      84
Southeastern Utah             74
California                    59
Southwest Colorado            58
North America                 58
Great Plains                  51
Name: count, dtype: int64

In [565]:
# Replace 'Unknown' and 'Other' with 'Unknown/Other'
df['Country'] = df['Country'].replace(['Unknown', 'Other'], 'Unknown/Other')

In [566]:
df = df[df['Country'] != 'Unknown/Other']
print(df['Country'].value_counts())

Country
United States of America    1013
Peru                         250
Northeast Arizona            211
New Mexico                   205
Arizona                      200
Guatemala                    198
Panama                       142
Mexico                       123
Alaska                       102
Canada                        92
Colorado                      84
Southeastern Utah             74
California                    59
North America                 58
Southwest Colorado            58
Great Plains                  51
Name: count, dtype: int64


In [567]:
df['Country'].nunique()

16

In [568]:
df.shape

(2920, 7)

### Region

In [569]:
df['Region'].value_counts()

Region
Unknown                         681
Southeastern Utah               169
Pachacamac                      139
Pennsylvania                    134
Sitio Conte                     124
Oklahoma                         91
Texas                            89
South Dakota                     69
Arizona                          58
Little Colorado Area             57
Alaska                           57
Southwestern Colorado            50
New Mexico                       48
Sun Temple                       43
North Carolina                   42
Wyoming                          39
Labrador                         39
Nazca District                   38
West Virginia                    34
San Juan Area                    31
Department of Quiche             25
Canyon                           25
British Columbia                 25
Zuni Pueblo                      25
Northeast Arizona                24
Florida                          22
United States of America         20
Quebec               

In [570]:
df.drop(columns = ['Region'], inplace=True)

### Materials

In [571]:
df['Materials'].value_counts()

Materials
Clay                                                                                    498
Stone                                                                                   166
Cotton                                                                                  150
Ceramic                                                                                 122
Wood                                                                                    120
Gold                                                                                    106
Flint                                                                                    91
Clay | Pigment                                                                           71
Wool                                                                                     48
Bone                                                                                     47
Chalcedony                                                            

In [572]:
# Step 1: Extract the first material from the 'Materials' column
df['Material'] = df['Materials'].apply(lambda x: x.split(' | ')[0])

In [573]:
df.drop(columns=['Materials'], inplace=True)

In [574]:
df['Material'].value_counts()

Material
Clay                          577
Wood                          327
Cotton                        213
Stone                         184
Ceramic                       153
Gold                          119
Flint                          91
Buckskin                       71
Wool                           69
Bone                           65
Yucca                          63
Chalcedony                     47
Jade                           46
Fiber                          42
Quartz                         40
Plant Fiber                    38
Silver                         35
Hide                           35
Spruce Root                    33
Chert                          29
Shell                          25
Leather                        22
Skin                           21
Deer Bone                      19
Rawhide                        16
Quartzite                      16
Glass                          16
Jasper                         14
Argillite                      13
Gourd

In [575]:
df['Material'].value_counts().mean()

12.066115702479339

In [576]:
# Set the threshold for grouping 'Culture Area' as 'Other'
threshold = 10

counts = df['Material'].value_counts()

# Apply 'Other' to 'Culture Area' where the count is less than the threshold
df['Material'] = df['Material'].apply(
    lambda x: 'Other' if counts[x] < threshold else x
)
df['Material'].value_counts()

Material
Clay           577
Other          459
Wood           327
Cotton         213
Stone          184
Ceramic        153
Gold           119
Flint           91
Buckskin        71
Wool            69
Bone            65
Yucca           63
Chalcedony      47
Jade            46
Fiber           42
Quartz          40
Plant Fiber     38
Silver          35
Hide            35
Spruce Root     33
Chert           29
Shell           25
Leather         22
Skin            21
Deer Bone       19
Quartzite       16
Glass           16
Rawhide         16
Jasper          14
Argillite       13
Ivory           11
Gourd           11
Name: count, dtype: int64

In [577]:
df = df[df['Material'] != 'Other']
print(df['Material'].value_counts())

Material
Clay           577
Wood           327
Cotton         213
Stone          184
Ceramic        153
Gold           119
Flint           91
Buckskin        71
Wool            69
Bone            65
Yucca           63
Chalcedony      47
Jade            46
Fiber           42
Quartz          40
Plant Fiber     38
Silver          35
Hide            35
Spruce Root     33
Chert           29
Shell           25
Leather         22
Skin            21
Deer Bone       19
Quartzite       16
Glass           16
Rawhide         16
Jasper          14
Argillite       13
Ivory           11
Gourd           11
Name: count, dtype: int64


In [578]:
df['Material'].nunique()

31

In [579]:
df.shape

(2461, 6)

In [580]:
df.nunique().sort_values()

Culture Area     10
Country          16
Period           20
Material         31
Culture          54
Site            207
dtype: int64

In [581]:
df.drop(columns=['Site'], inplace=True)

In [582]:
df.nunique().sort_values()

Culture Area    10
Country         16
Period          20
Material        31
Culture         54
dtype: int64

In [583]:
df.shape

(2461, 5)

In [584]:
# Group by the columns in the sunburst path and count rows
grouped_data = df.groupby(['Culture Area', 'Country', 'Period', 'Material', 'Culture']).size().reset_index(name='Data Points Count')

In [585]:
grouped_data['Data Points Count'].mean()

3.5718432510885343

In [586]:
grouped_data.shape

(689, 6)

In [587]:
filtered_data = grouped_data[grouped_data['Data Points Count'] >= 15]

In [588]:
filtered_data['Data Points Count'].sum()

1027

In [589]:
filtered_data['Data Points Count'].max()

101

In [590]:
filtered_data['Data Points Count'].median()

21.5

In [591]:
filtered_data.to_csv("Datasets/grouped_and_filtered_data_for_taxonomyy.csv")