# Clean Ravelry data

Remove unneeded columns.

Combine yarn dataframes and shop dataframes.

Explore a bit to see what else I need to clean.

In [None]:
import pandas as pd


In [None]:
# Load the raw pattern details; low_memory=False reads the file in one pass
patterns_df = pd.read_csv(filepath_or_buffer='../data/df_pattdetails.csv',
                          low_memory=False)

In [None]:
# Show up to 60 columns when displaying dataframes, then preview the patterns
pd.options.display.max_columns = 60
patterns_df.head(5)

In [None]:
# Summarize column dtypes and non-null counts for the patterns dataframe
patterns_df.info()

In [None]:
# Count missing values per column
patterns_df.isna().sum()

# Inspecting the contents of individual columns

In [None]:
# View the three URL/product columns side by side
patterns_df.loc[:, ['pdf_url', 'url', 'product_id']]

In [None]:
# Inspect the raw download_location values
print(patterns_df['download_location'].values)

In [None]:
# Inspect the raw gauge_description values
print(patterns_df['gauge_description'].values)

In [None]:
# Keep this column. Its values are in dictionary form, so the info will need
# to be broken out if I end up using it.
print(patterns_df['pattern_needle_sizes'].values)

In [None]:
# Keep this column. It contains the called-for yarn in dictionary form, so the
# info will need to be broken out if I end up using it.
print(patterns_df['packs'].values)

In [None]:
# Inspect the raw printings values
print(patterns_df['printings'].values)

In [None]:
# Info on the called-for yarn. Some of it is duplicated elsewhere, but keep the
# column in case it is needed.
print(patterns_df['yarn_weight'].values)

In [None]:
# Keep this column. The information needs to be broken out of dictionary form;
# this one should be straightforward.
print(patterns_df['craft'].values)

In [None]:
# Contains category information, so it is needed. Still have to work out the
# best way to make it usable.
print(patterns_df['pattern_categories'].values)

In [None]:
# Keep this column, although I may not use it: pattern attributes can get
# really in the weeds.
print(patterns_df['pattern_attributes'].values)

In [None]:
# Inspect the raw pattern_author values
print(patterns_df['pattern_author'].values)

In [None]:
# Inspect the raw photos values
print(patterns_df['photos'].values)

In [None]:
# A better source for the high-level category than pattern_categories
print(patterns_df['pattern_type'].values)

In [None]:
# Frequency of each currency, including missing values
patterns_df['currency'].value_counts(dropna=False)

In [None]:
# Redundant given the currency column, so not needed
patterns_df['currency_symbol'].value_counts(dropna=False)

In [None]:
# All values are False; this relates to an individual user's library info
patterns_df['pdf_in_library'].value_counts(dropna=False)

In [None]:
# Frequency of each downloadable value, including missing values
patterns_df['downloadable'].value_counts(dropna=False)

In [None]:
# This text would be useful for a machine learning project, but I don't expect
# to use it here.
print(patterns_df['notes'].values)

# Drop columns - patterns dataframe

In [None]:
# NOTE: every drop below passes `columns=` instead of the positional axis
# (`drop(labels, 1)`), which was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.

# leftover index columns; remember to pass index=False when writing CSVs in future
patterns_df = patterns_df.drop(columns=['Unnamed: 0', 'index'])

# 2 columns with no non-null values
patterns_df = patterns_df.drop(columns=['personal_attributes', 'volumes_in_library'])

# a lot of nulls in several columns that won't impact the analysis
patterns_df = patterns_df.drop(columns=['pdf_url', 'url', 'product_id'])

# columns with info duplicated in another column
patterns_df = patterns_df.drop(columns=['permalink', 'notes_html'])

# of the 4 date columns, keep only the one recording when the pattern was
# added to the Ravelry database
patterns_df = patterns_df.drop(columns=['generally_available', 'published', 'updated_at'])

# columns with unneeded information
patterns_df = patterns_df.drop(columns=['gauge_description', 'printings', 'pattern_author',
                                        'photos', 'currency_symbol', 'download_location',
                                        'pdf_in_library', 'notes', 'yardage_description'])

# id column was duplicated when creating dataframe
patterns_df = patterns_df.drop(columns=['id'])

In [None]:
# Preview the patterns dataframe after dropping columns
patterns_df.head()

In [None]:
# Re-check the remaining columns, dtypes and non-null counts
patterns_df.info()

In [None]:
# Write the cleaned patterns dataframe to CSV, leaving out the index so it
# does not come back as an extra column on the next read
patterns_df.to_csv(path_or_buf='../data/df_patterns_clean.csv', index=False)

----------------------------

# Yarn dataframes

In [None]:
# Load the raw yarn listings
yarnlisting_df = pd.read_csv(filepath_or_buffer='../data/df_yarnlistings.csv')

In [None]:
# Preview the first rows of the yarn listings
yarnlisting_df.head()

In [None]:
# Summarize column dtypes and non-null counts for the yarn listings
yarnlisting_df.info()

In [None]:
# Keep, but the values need standardizing (plied/Plied etc.)
yarnlisting_df['texture'].value_counts(dropna=False)

In [None]:
# Drop: only 68 non-null values
yarnlisting_df['thread_size'].value_counts(dropna=False)

In [None]:
# Drop: this would be useful if more non-null values were present
yarnlisting_df['wpi'].value_counts(dropna=False)

In [None]:
# Keep; the useful info will need to be broken out of dictionary form
print(yarnlisting_df['yarn_weight'].values)

# Drop columns from yarn listings dataframe

In [None]:
# NOTE: every drop below passes `columns=` instead of the positional axis
# (`drop(labels, 1)`), which was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.

# leftover index column; remember to pass index=False when writing CSVs in future
yarnlisting_df = yarnlisting_df.drop(columns=['Unnamed: 0'])

# one column with no non-null values
yarnlisting_df = yarnlisting_df.drop(columns=['personal_attributes'])

# a lot of nulls in two columns that won't impact the analysis
yarnlisting_df = yarnlisting_df.drop(columns=['thread_size', 'wpi'])

# unneeded information
yarnlisting_df = yarnlisting_df.drop(columns=['permalink', 'first_photo'])

In [None]:
# Preview the yarn listings after dropping columns
yarnlisting_df.head()

In [None]:
# Re-check the remaining columns, dtypes and non-null counts
yarnlisting_df.info()

# Yarn details dataframe

In [None]:
# Load the raw yarn details
yarndetails_df = pd.read_csv(filepath_or_buffer='../data/df_yarndetails.csv')

In [None]:
# Preview the first rows of the yarn details
yarndetails_df.head()

In [None]:
# Drop the columns that are duplicated in yarnlisting_df; yarn_id is kept.
# `columns=` replaces the positional axis (`drop(labels, 1)`), which was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
yarndetails_df = yarndetails_df.drop(columns=[
    'Unnamed: 0', 'discontinued', 'gauge_divisor', 'grams', 'id',
    'machine_washable', 'max_gauge', 'min_gauge', 'name', 'permalink',
    'rating_average', 'rating_count', 'rating_total', 'texture',
    'thread_size', 'wpi', 'yardage', 'personal_attributes',
])


In [None]:
# Preview the yarn details after dropping the duplicated columns
yarndetails_df.head()

In [None]:
# Check the remaining columns, dtypes and non-null counts
yarndetails_df.info()

# Look at the remaining columns and drop the unneeded ones

In [None]:
# Inspect the raw notes_html values
print(yarndetails_df['notes_html'].values)

In [None]:
# Keep, but break the useful info out of dictionary form.
# The metric needle size is the one I want, but break out the US size too:
# for any yarn without metric info, the US size can supply the data.
print(yarndetails_df['min_needle_size'].values)

In [None]:
# Keep; the yarn weight name needs to be broken out
print(yarndetails_df['yarn_weight'].values)

In [None]:
# Inspect the raw yarn_company values
print(yarndetails_df['yarn_company'].values)

In [None]:
# Keep: I want to analyze yarn fiber content, at least at a high level
# (animal fiber yes/no, silk/wool/alpaca/cotton etc.).
# Whether percentages are included depends on how I can make it work.
print(yarndetails_df['yarn_fibers'].values)

In [None]:
# Inspect the raw photos values
print(yarndetails_df['photos'].values)

In [None]:
# Drop unneeded columns.
# `columns=` replaces the positional axis (`drop(labels, 1)`), which was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
yarndetails_df = yarndetails_df.drop(columns=['notes_html', 'yarn_company', 'photos'])

In [None]:
# Preview the yarn details after dropping the unneeded columns
yarndetails_df.head()

In [None]:
# Re-check the remaining columns, dtypes and non-null counts
yarndetails_df.info()

# Merge yarn dataframes using yarnlisting_df.id and yarndetails_df.yarn_id

In [None]:
# Join each listing row to its detail row (yarnlisting_df.id matches
# yarndetails_df.yarn_id). pd.merge defaults to an inner join, so only yarns
# present in both dataframes are kept.
# NOTE(review): yarn_weight is kept in both inputs, so it will come out of the
# merge as yarn_weight_x / yarn_weight_y - confirm that is intended.
yarn_df = pd.merge(yarnlisting_df, yarndetails_df,
                   left_on='id',
                   right_on='yarn_id')

# `id` repeats `yarn_id` after the join. `columns=` replaces the positional
# axis (`drop(labels, 1)`), which raises a TypeError from pandas 2.0 onwards.
yarn_df = yarn_df.drop(columns=['id'])
yarn_df.head()

In [None]:
# Write the cleaned, merged yarn dataframe to CSV, leaving out the index
yarn_df.to_csv(path_or_buf='../data/df_yarn_clean.csv', index=False)

-----------------------------------

# Shops dataframes

In [None]:
# Load the raw shop listings
shoplisting_df = pd.read_csv(filepath_or_buffer='../data/df_shoplistings.csv')

In [None]:
# Preview the first rows of the shop listings
shoplisting_df.head()

In [None]:
# Summarize column dtypes and non-null counts for the shop listings
shoplisting_df.info()

In [None]:
# Drop: according to the database, every listed shop is still in business
shoplisting_df['closed'].value_counts()

In [None]:
# Keep for now, even though there are apparently non-US shops; those records
# will be cleaned after the merge with the shop details df.
shoplisting_df['country'].value_counts()

In [None]:
# Frequency of each non-null free_wifi value
shoplisting_df['free_wifi'].value_counts()

In [None]:
# Frequency of each non-null pos_online value
shoplisting_df['pos_online'].value_counts()

In [None]:
# Frequency of each non-null ravelry_retailer value
shoplisting_df['ravelry_retailer'].value_counts()

In [None]:
# Drop unneeded columns from the shop listing df.
# NOTE: every drop below passes `columns=` instead of the positional axis
# (`drop(labels, 1)`), which was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.

# leftover index column; remember to pass index=False when writing CSVs in future
shoplisting_df = shoplisting_df.drop(columns=['Unnamed: 0'])

# one column with no non-null values
shoplisting_df = shoplisting_df.drop(columns=['distance'])

# a lot of nulls in several columns that won't impact the analysis
shoplisting_df = shoplisting_df.drop(columns=['parking', 'free_wifi', 'seating', 'twitter_id'])

# unneeded information
shoplisting_df = shoplisting_df.drop(columns=['permalink', 'phone', 'url', 'wheelchair_access',
                                              'facebook_page', 'closed'])

In [None]:
# Preview the shop listings after dropping columns
shoplisting_df.head()

In [None]:
# Re-check the remaining columns, dtypes and non-null counts
shoplisting_df.info()

In [None]:
# Country values (stored as dictionary-formatted strings) for shops outside
# the US; used to examine those listings.
othercountry_list = [
    "{'id': 39, 'name': 'Canada'}",
    "{'id': 228, 'name': 'United Kingdom'}",
    "{'id': 13, 'name': 'Australia'}",
    "{'id': 105, 'name': 'Ireland'}",
    "{'id': 199, 'name': 'South Africa'}",
    "{'id': 156, 'name': 'New Zealand'}",
    "{'id': 162, 'name': 'Northern Ireland'}",
    "{'id': 84, 'name': 'Germany'}",
    "{'id': 103, 'name': 'India'}",
    "{'id': 88, 'name': 'Greece'}",
]

shoplisting_df[shoplisting_df['country'].isin(othercountry_list)]

In [None]:
# Remove every listing whose country is in othercountry_list, then renumber
# the rows from zero.
is_other_country = shoplisting_df['country'].isin(othercountry_list)
drop_index = shoplisting_df.index[is_other_country]
shoplisting_df = shoplisting_df.drop(index=drop_index).reset_index(drop=True)

In [None]:
# Re-run the other-country filter to check what is left after the drop
shoplisting_df[shoplisting_df['country'].isin(othercountry_list)]

In [None]:
# Re-check row count, dtypes and non-null counts after removing rows
shoplisting_df.info()

In [None]:
# Write the cleaned shops dataframe to CSV, leaving out the index
shoplisting_df.to_csv(path_or_buf='../data/df_shop_clean.csv', index=False)

# Shop details dataframe

After looking more closely at the shop details dataset, I found no additional details that need to be added to the shop listings dataframe.