# Cleaning with Pandas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # exciting, we get to look at some visualisation!
import seaborn as sns 

### 1. Load `rock.csv` and do an initial examination of its data columns.

In [2]:
# Read data/rock.csv into a DataFrame, then preview its first rows.
rock_csv_path = "data/rock.csv"
df = pd.read_csv(rock_csv_path)

df.head()



In [3]:
# Summarise the frame: per-column non-null counts and dtypes.
# (The `release` column's dtype is investigated further below.)

df.info()



### 2.  Clean up the column names.

Clean up the column names. Remove spaces and capitals.

In [4]:
# Give every column a short, lower-case name without spaces or punctuation.

rename_map = {
    # original column: renamed column
    'Song Clean':    'song',
    'ARTIST CLEAN':  'artist',
    'Release Year':  'release',
    'COMBINED':      'song_artist',
    'First?':        'first',
    'Year?':         'year',
    'PlayCount':     'playcount',
    'F*G':           'fg',
}

# Re-bind `df` to the renamed frame rather than mutating it with inplace=True;
# keys that are no longer present are ignored, so re-running this cell is safe.
df = df.rename(columns=rename_map)
df.head(4)



### 3. How many missing values are there?

Subset the rows with a missing release year: call `.isnull()` on the release column and use the resulting boolean mask to filter the DataFrame.

In [5]:
# Total number of null cells across every column of the frame.
total_missing = df.isnull().sum().sum()
print(total_missing)

# Boolean mask of rows whose release year is null; show the first few of them.
release_is_missing = df['release'].isnull()
display(df[release_is_missing].head())



### 4. Find out why the `release` column is coded as object type.

In [6]:
# List the distinct raw values in `release` to spot any non-numeric entries.

df['release'].unique()



### 5. Convert to a more appropriate format.

> Hint: Use a try-except statement, see [here](http://www.pythonforbeginners.com/error-handling/python-try-and-except) or [here](https://www.programiz.com/python-programming/exception-handling).

In [15]:
# add your code below
# you can define a function and apply with .map()
# def convert_to_float(x):...

def convert_to_float(x):
    """Convert a single value to ``float``, falling back to NaN on failure.

    Parameters
    ----------
    x : object
        The value to convert, e.g. a release-year string such as ``'1975'``.

    Returns
    -------
    float
        ``float(x)`` when the conversion succeeds; otherwise ``np.nan``,
        after printing a message that names the offending value.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Catch only genuine conversion failures. A bare ``except:`` would
        # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        print(f'{x} cannot be converted to float')
        return np.nan

# Run the converter over every raw release value and keep the result in a
# separate column, leaving the original `release` strings untouched.
release_as_float = df['release'].map(convert_to_float)
df['release_float'] = release_as_float



### 6. Obtain summary statistics for the converted release column.

What do you notice? Any irregularities hmmm? What would you suggest we do?

> Hint: look at the releases...

In [8]:
# Summary statistics first, then a closer look at the suspicious entries.
separator = '----' * 8

print(df['release_float'].describe())
print(separator)
# Row labels 1504 and 547 are the two entries that get overwritten in the
# following cell -- presumably the irregular values; confirm from the output.
for row_label in (1504, 547):
    print(df.loc[row_label, 'release_float'])
print(separator)
# Every row whose converted release year is earlier than 1950.
display(df[df['release_float'] < 1950])



In [9]:
# Overwrite the two suspect release years, addressed by row label.
# NOTE(review): 1972 and 1971 are presumably the true release years for these
# rows -- verify against an external source.
release_corrections = {1504: 1972, 547: 1971}

for row_label, corrected_year in release_corrections.items():
    df.loc[row_label, 'release_float'] = corrected_year



### 7. What is the role of the year column? How does it relate to the release year?

In [10]:
# Evidence: compare the release values seen for each setting of the `year` flag.

year_is_zero = df['year'] == 0
year_is_one = df['year'] == 1

print(df.loc[year_is_zero, 'release'].unique())
print(df.loc[year_is_one, 'release_float'].unique())

# Conclusion: year == 1 indicates that the release year is populated.



### 8. Plot the distributions of the release year and playcount

> Hint: use the `.hist` on `release_float` and `playcount`

In [11]:
# One histogram per column, drawn side by side; the trailing semicolon
# suppresses the textual repr of the returned axes.
hist_columns = ['release_float', 'playcount']

df[hist_columns].hist(figsize=(12, 4));



### 9. Plot the playcount versus the release year
> Hint: use `sns.scatterplot()` with `x='release_float'`, `y='playcount'` and `data=df`.

For documentation, check [here](https://seaborn.pydata.org/generated/seaborn.scatterplot.html).

In [12]:
# Scatter of play count (y) against converted release year (x), one point per row.

sns.scatterplot(
    data=df,
    x='release_float',
    y='playcount',
)
plt.show()



### 10. Which 10 years have the most releases?

In [13]:
# Count how many times each release year occurs; value_counts() sorts the
# counts in descending order, so the first ten rows are the ten busiest years.
#hint

df['release_float'].value_counts().head(10)

#/hint

### 11. Which artists have the most songs?

In [14]:
# Count songs per artist and keep the ten artists with the most entries.
#hint

df['artist'].value_counts().head(10)

#/hint