# Import Libraries

In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os, json

from matplotlib.ticker import StrMethodFormatter
sns.set_style('whitegrid')

import warnings
warnings.filterwarnings('ignore')

# Load Data

In [2]:
# Read the collection export into a DataFrame. The path is relative, so the
# notebook has to be run from the directory that contains the `Data/` folder.
# NOTE(review): the columns (release_id, CollectionFolder, ...) look like a
# Discogs collection export — confirm the data source.
df = pd.read_csv("Data/replytj-collection-20230727-1508.csv")
# Bare expression: renders the frame as the cell output.
df

Unnamed: 0,Catalog#,Artist,Title,Label,Format,Rating,Released,release_id,CollectionFolder,Date Added,Collection Media Condition,Collection Sleeve Condition,Collection Notes
0,180GWALP01,DJ Yoshizawa Dynamite.jp & Chintam,Wamono A To Z Vol. I (Japanese Jazz Funk & Rar...,180g,"LP, Comp, 180",,2020,15796599,Japanese,2022-02-07 21:13:40,Mint (M),Mint (M),
1,180GWALP02,DJ Yoshizawa Dynamite.jp & Chintam,Wamono A To Z Vol. II (Japanese Funk 1970​-​1977),180g,"LP, Comp",,2021,17347705,Japanese,2021-04-03 21:33:59,Mint (M),Mint (M),
2,TC 2488,Carl Carlton,"She's A Bad Mama Jama (She's Built, She's Stac...",20th Century Fox Records,"7"", Single, 4 P",,1981,779460,45 Funk/Soul,2019-10-14 01:15:52,Very Good (VG),Very Good (VG),
3,TC-2069,Love Unlimited Orchestra,Love's Theme,20th Century Records,"7"", Single, Styrene, Pit",,1973,16110429,45 Funk/Soul,2023-06-11 00:56:57,Very Good (VG),Generic,
4,FP-1248,Fausto Rey,El Cuarto Album,4 Points,"LP, Album",,1973,8235018,Latin Funk,2019-11-30 23:12:30,Near Mint (NM or M-),Very Good Plus (VG+),
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1252,"XL LP 450, YT031LP",The XX,XX,"XL Recordings, Young Turks","LP, Album",,2009,1976385,Indie/Alternative,2018-08-13 22:25:23,Very Good Plus (VG+),Very Good Plus (VG+),
1253,B0033811-01,The Weeknd,House Of Balloons,"XO, Republic Records, UMe","2xLP, Ltd, Mixtape, RE, Dec",,2022,22353862,Modern R&B,2022-03-14 20:01:22,Mint (M),Mint (M),
1254,YTLP060,SBTRKT,SBTRKT,Young Turks,"LP, Album",,2011,2959170,Electronic,2018-08-13 22:27:40,Very Good Plus (VG+),Very Good Plus (VG+),
1255,MIR100740,Scientist,Scientist Encounters Pac-Man,Мирумир,"LP, Album, RE",,2014,6532400,Reggae,2023-04-01 23:12:39,Mint (M),Mint (M),


# Explore Data

In [3]:
# Print each column's name, non-null count and dtype for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1257 entries, 0 to 1256
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Catalog#                     1257 non-null   object 
 1   Artist                       1257 non-null   object 
 2   Title                        1257 non-null   object 
 3   Label                        1257 non-null   object 
 4   Format                       1257 non-null   object 
 5   Rating                       0 non-null      float64
 6   Released                     1257 non-null   int64  
 7   release_id                   1257 non-null   int64  
 8   CollectionFolder             1257 non-null   object 
 9   Date Added                   1257 non-null   object 
 10  Collection Media Condition   1239 non-null   object 
 11  Collection Sleeve Condition  1220 non-null   object 
 12  Collection Notes             1 non-null      object 
dtypes: float64(1), int

# Check for Duplicates

In [4]:
# Count rows that repeat an earlier row in every column.
df.duplicated().sum()

2

In [5]:
# Keep the first occurrence of each fully duplicated row and rebind `df`
# to the de-duplicated frame (the index is not reset here).
df = df.drop_duplicates()

In [6]:
# Re-count duplicated rows after the drop; the expected result is 0.
df.duplicated().sum()

0

# Check for Missing Values

In [7]:
# Count the missing (NaN) values in each column.
df.isna().sum()

Catalog#                          0
Artist                            0
Title                             0
Label                             0
Format                            0
Rating                         1255
Released                          0
release_id                        0
CollectionFolder                  0
Date Added                        0
Collection Media Condition       18
Collection Sleeve Condition      37
Collection Notes               1254
dtype: int64

In [8]:
# Replace missing ratings with the placeholder 'Unknown'.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, is
# deprecated, and does not update `df` under pandas Copy-on-Write.
# 'Rating' is float64 when loaded, so filling it with a string makes the
# column object-typed.
df['Rating'] = df['Rating'].fillna('Unknown')

In [9]:
# Replace missing collection notes with the placeholder 'Unknown'.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, is
# deprecated, and does not update `df` under pandas Copy-on-Write.
df['Collection Notes'] = df['Collection Notes'].fillna('Unknown')

In [10]:
# Replace missing media-condition grades with the placeholder 'Unknown'.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, is
# deprecated, and does not update `df` under pandas Copy-on-Write.
df['Collection Media Condition'] = df['Collection Media Condition'].fillna('Unknown')

In [11]:
# Replace missing sleeve-condition grades with the placeholder 'Unknown'.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, is
# deprecated, and does not update `df` under pandas Copy-on-Write.
df['Collection Sleeve Condition'] = df['Collection Sleeve Condition'].fillna('Unknown')

In [12]:
# Re-count missing values per column; every count should now be 0.
df.isna().sum()

Catalog#                       0
Artist                         0
Title                          0
Label                          0
Format                         0
Rating                         0
Released                       0
release_id                     0
CollectionFolder               0
Date Added                     0
Collection Media Condition     0
Collection Sleeve Condition    0
Collection Notes               0
dtype: int64

# Fix any inconsistent categories of data (example: fix cat, Cat, and cats so that they are consistent).

In [13]:
# Look up every column's dtype, then keep only the object-typed entries
data_types = df.dtypes
object_mask = data_types == "object"
object_data_types = data_types.loc[object_mask]
object_data_types

Catalog#                       object
Artist                         object
Title                          object
Label                          object
Format                         object
Rating                         object
CollectionFolder               object
Date Added                     object
Collection Media Condition     object
Collection Sleeve Condition    object
Collection Notes               object
dtype: object

In [14]:
# Names of the object-typed columns (the index of the dtype Series).
object_data_types.index

Index(['Catalog#', 'Artist', 'Title', 'Label', 'Format', 'Rating',
       'CollectionFolder', 'Date Added', 'Collection Media Condition',
       'Collection Sleeve Condition', 'Collection Notes'],
      dtype='object')

In [16]:
# For each object-typed column, print its name, then its distinct values,
# followed by two blank lines.
for column in object_data_types.index:
    print(column, df[column].unique(), sep='\n', end='\n\n\n')

Catalog#
['180GWALP01' '180GWALP02' 'TC 2488' ... 'YTLP060' 'MIR100740' 'MIR100741']


Artist
['DJ Yoshizawa Dynamite.jp & Chintam' 'Carl Carlton'
 'Love Unlimited Orchestra' 'Fausto Rey' 'Raphael Saadiq & Q-Tip / Q-Tip'
 'Cocteau Twins' 'Eric B. & Rakim' 'Ben Folds Five' 'Ginuwine'
 'quickly, quickly' 'Augustus Pablo'
 'Hollywood Disco Jazz Band Featuring The Waters' 'Carpenters'
 'Quincy Jones' 'Brothers Johnson' 'The Police' 'Janet Jackson'
 'George Benson' 'Kalapana' 'Dilated Peoples' 'Rufus' 'Ramp (3)'
 'The Grass Roots' 'Dynamic Corvettes' 'Sam Cooke' 'Various' 'Dr. Dre'
 'Carlos Malcolm' 'Lynda Dawn' 'Joyce Wrice & Kay Franklin'
 'The Gangsters (6)' 'The Legendary Beyons' 'Slickaphonic' 'Transport'
 'Lights Out (19)' 'Milton Davis' 'Tito Allen'
 'Hiroshi Sato Featuring Wendy Matthews' 'Nova (81)' 'Mike Lundy'
 'Esther Williams / The Soul Searchers' 'Clarence Reid'
 'The Beginning Of The End' 'Betty Wright' 'Joyce' 'DJ Kool' 'Shadow (11)'
 'Gilberto Sextet' 'The Uniques' 'Title F

In [17]:
# List the distinct media-condition labels before they are renamed.
print(df['Collection Media Condition'].unique())

['Mint (M)' 'Very Good (VG)' 'Near Mint (NM or M-)' 'Very Good Plus (VG+)'
 'Good Plus (G+)' 'Unknown' 'Poor (P)' 'Good (G)']


In [18]:
# Shorten the grading labels by removing the parenthesised abbreviations.
condition_labels = {
    'Mint (M)': 'Mint',
    'Very Good (VG)': 'Very Good',
    'Near Mint (NM or M-)': 'Near Mint',
    # NOTE(review): this folds 'Very Good Plus (VG+)' into 'Very Good', so the
    # two grades can no longer be told apart. Confirm this is intended;
    # otherwise map it to 'Very Good Plus'.
    'Very Good Plus (VG+)': 'Very Good',
    'Good Plus (G+)': 'Good Plus',
    'Poor (P)': 'Poor',
    'Good (G)': 'Good',
}
# NOTE(review): 'Fair (F)' occurs in the sleeve-condition column but has no
# entry here, so it keeps its abbreviation. Add 'Fair (F)': 'Fair' if it
# should be shortened as well.

# Apply the mapping only to the two condition columns. A frame-wide replace
# would also rewrite any Title/Artist/Label cell that happens to equal one of
# these strings. Assigning the result back avoids inplace=True.
condition_columns = ['Collection Media Condition', 'Collection Sleeve Condition']
df[condition_columns] = df[condition_columns].replace(condition_labels)

In [19]:
# Count how many rows carry each media-condition label.
df['Collection Media Condition'].value_counts()

Mint         618
Near Mint    356
Very Good    210
Good Plus     40
Unknown       18
Good          12
Poor           1
Name: Collection Media Condition, dtype: int64

In [20]:
# List the distinct sleeve-condition labels.
print(df['Collection Sleeve Condition'].unique())

['Mint' 'Very Good' 'Generic' 'Near Mint' 'Good Plus' 'Unknown' 'Good'
 'Poor' 'Fair (F)' 'No Cover']


In [21]:
# Count how many rows carry each sleeve-condition label.
df['Collection Sleeve Condition'].value_counts()

Mint         569
Near Mint    270
Generic      182
Very Good    145
Good Plus     37
Unknown       37
Good          10
Fair (F)       3
Poor           1
No Cover       1
Name: Collection Sleeve Condition, dtype: int64

In [22]:
# Summary statistics for the numeric columns only.
# NOTE(review): the recorded output shows a minimum of 0 for 'Released',
# which presumably marks an unknown release year and pulls the mean down —
# confirm and consider treating 0 as missing.
df.describe(include="number")

Unnamed: 0,Released,release_id
count,1255.0,1255.0
mean,1908.882072,10054090.0
std,428.330951,7525037.0
min,0.0,37009.0
25%,1980.0,2932262.0
50%,2015.0,9258585.0
75%,2019.0,14913900.0
max,2023.0,27685690.0


### Drop 'Collection Notes' and 'Rating' from dataset (both columns are almost entirely empty)

In [23]:
# Remove the notes column, which held a single non-null value when loaded.
# Rebinding `df` with errors='ignore' keeps this cell re-runnable: the
# in-place drop raised KeyError when the cell was executed a second time.
df = df.drop(columns='Collection Notes', errors='ignore')
df.head(3)

Unnamed: 0,Catalog#,Artist,Title,Label,Format,Rating,Released,release_id,CollectionFolder,Date Added,Collection Media Condition,Collection Sleeve Condition
0,180GWALP01,DJ Yoshizawa Dynamite.jp & Chintam,Wamono A To Z Vol. I (Japanese Jazz Funk & Rar...,180g,"LP, Comp, 180",Unknown,2020,15796599,Japanese,2022-02-07 21:13:40,Mint,Mint
1,180GWALP02,DJ Yoshizawa Dynamite.jp & Chintam,Wamono A To Z Vol. II (Japanese Funk 1970​-​1977),180g,"LP, Comp",Unknown,2021,17347705,Japanese,2021-04-03 21:33:59,Mint,Mint
2,TC 2488,Carl Carlton,"She's A Bad Mama Jama (She's Built, She's Stac...",20th Century Fox Records,"7"", Single, 4 P",Unknown,1981,779460,45 Funk/Soul,2019-10-14 01:15:52,Very Good,Very Good


In [24]:
# Remove the rating column, which had no non-null values when loaded.
# Rebinding `df` with errors='ignore' keeps this cell re-runnable: the
# in-place drop raised KeyError when the cell was executed a second time.
df = df.drop(columns='Rating', errors='ignore')
df.head(3)

Unnamed: 0,Catalog#,Artist,Title,Label,Format,Released,release_id,CollectionFolder,Date Added,Collection Media Condition,Collection Sleeve Condition
0,180GWALP01,DJ Yoshizawa Dynamite.jp & Chintam,Wamono A To Z Vol. I (Japanese Jazz Funk & Rar...,180g,"LP, Comp, 180",2020,15796599,Japanese,2022-02-07 21:13:40,Mint,Mint
1,180GWALP02,DJ Yoshizawa Dynamite.jp & Chintam,Wamono A To Z Vol. II (Japanese Funk 1970​-​1977),180g,"LP, Comp",2021,17347705,Japanese,2021-04-03 21:33:59,Mint,Mint
2,TC 2488,Carl Carlton,"She's A Bad Mama Jama (She's Built, She's Stac...",20th Century Fox Records,"7"", Single, 4 P",1981,779460,45 Funk/Soul,2019-10-14 01:15:52,Very Good,Very Good
