# Clean Ravelry data

Remove unneeded columns.

Combine yarn dataframes and shop dataframes.

Explore a bit to see what else I need to clean.

In [1]:
import pandas as pd


In [None]:
patterns_df = pd.read_csv('../data/df_pattdetails.csv', low_memory = False)

In [None]:
# raise the pandas display limit to 60 columns so more columns are visible in cell output
# (this option applies to every later cell in the session, not just this one)
pd.set_option('display.max_columns', 60)
patterns_df.head()

In [None]:
patterns_df.info()

In [None]:
patterns_df.isnull().sum()

# Inspecting the contents of individual columns

In [None]:
patterns_df[['pdf_url', 'url', 'product_id']]

In [None]:
print(patterns_df.download_location.values)

In [None]:
print(patterns_df.gauge_description.values)

In [None]:
# keep column, but will need to break the info out if I need to use it
# dictionary
print(patterns_df.pattern_needle_sizes.values)

In [None]:
# keep column, but will need to break the info out if I need to use it
# dictionary
# contains called-for yarn
print(patterns_df.packs.values)

In [None]:
print(patterns_df.printings.values)

In [None]:
# info on called-for yarn, some is duplicated elsewhere but don't drop the column in case it's needed

print(patterns_df.yarn_weight.values)

In [None]:
# keep column, need to break the information out of dictionary form, this one should be straightforward

print(patterns_df.craft.values)

In [None]:
# contains category information so it is needed, will have to consider best way to make it useable

print(patterns_df.pattern_categories.values)

In [None]:
# keep column, although I don't know if I'll use it. Pattern attributes can get really in the weeds.

print(patterns_df.pattern_attributes.values)

In [None]:
print(patterns_df.pattern_author.values)

In [None]:
print(patterns_df.photos.values)

In [None]:
# this is a better source for high-level category than pattern_categories

print(patterns_df.pattern_type.values)

In [None]:
patterns_df.currency.value_counts(dropna = False)

In [None]:
# not needed in addition to currency column

patterns_df.currency_symbol.value_counts(dropna = False)

In [None]:
# all values false, this is related to an individual user's library info

patterns_df.pdf_in_library.value_counts(dropna = False)

In [None]:
patterns_df.downloadable.value_counts(dropna = False)

In [None]:
# this information would be useful for machine learning project, but I don't think I'll be using it

print(patterns_df.notes.values)

# Drop columns - patterns dataframe

In [None]:
# Columns removed from the patterns dataframe, grouped by the reason for dropping them.
patterns_drop_cols = [
    # index columns that were saved into the csv
    # (write csv files with index = False in future to avoid these)
    'Unnamed: 0', 'index',
    # 2 columns with no non-null values
    'personal_attributes', 'volumes_in_library',
    # a lot of nulls in several columns that won't impact the analysis
    'pdf_url', 'url', 'product_id',
    # columns with info duplicated in another column
    'permalink', 'notes_html',
    # 3 of the 4 date columns; keep only when the pattern was added to the Ravelry database
    'generally_available', 'published', 'updated_at',
    # columns with unneeded information
    'gauge_description', 'printings', 'pattern_author', 'photos',
    'currency_symbol', 'download_location', 'pdf_in_library', 'notes',
    # id column was duplicated when creating dataframe
    'id',
]

# Pass the labels through `columns=`: the positional axis form `drop(labels, 1)`
# is deprecated since pandas 1.3 and raises TypeError from pandas 2.0 onwards.
# A missing label still raises KeyError, so a typo in the list is not silently ignored.
patterns_df = patterns_df.drop(columns = patterns_drop_cols)

In [None]:
patterns_df.head()

In [None]:
patterns_df.info()

In [None]:
# save cleaned patterns dataframe to csv

patterns_df.to_csv('../data/df_patterns_clean.csv', index = False)

----------------------------

# Yarn dataframes

In [None]:
yarnlisting_df = pd.read_csv('../data/df_yarnlistings.csv')

In [None]:
yarnlisting_df.head()

In [None]:
yarnlisting_df.info()

In [None]:
# keep, but clean up values (standardize plied/Plied etc)
yarnlisting_df.texture.value_counts(dropna = False)

In [None]:
# drop, only 68 non-null values
yarnlisting_df.thread_size.value_counts(dropna = False)

In [None]:
# drop, this info would be useful if more non-null values were present
yarnlisting_df.wpi.value_counts(dropna = False)

In [None]:
# keep, will need to break useful info out of dictionary
print(yarnlisting_df.yarn_weight.values)

# Drop columns from yarn listings dataframe

In [None]:
# Columns removed from the yarn listings dataframe, grouped by the reason for dropping them.
yarnlisting_drop_cols = [
    # index column that was saved into the csv
    # (write csv files with index = False in future to avoid this)
    'Unnamed: 0',
    # one column with no non-null values
    'personal_attributes',
    # a lot of nulls in two columns that won't impact the analysis
    'thread_size', 'wpi',
    # unneeded information
    'permalink', 'first_photo',
]

# Pass the labels through `columns=`: the positional axis form `drop(labels, 1)`
# is deprecated since pandas 1.3 and raises TypeError from pandas 2.0 onwards.
yarnlisting_df = yarnlisting_df.drop(columns = yarnlisting_drop_cols)

In [None]:
yarnlisting_df.head()

In [None]:
yarnlisting_df.info()

# Yarn details dataframe

In [None]:
yarndetails_df = pd.read_csv('../data/df_yarndetails.csv')

In [None]:
yarndetails_df.head()

In [None]:
# drop columns duplicated in yarnlisting_df except yarn_id
# (yarn_id is kept because it is the key used to merge with yarnlisting_df.id)
yarndetails_dup_cols = [
    'Unnamed: 0', 'discontinued', 'gauge_divisor', 'grams', 'id',
    'machine_washable', 'max_gauge', 'min_gauge', 'name', 'permalink',
    'rating_average', 'rating_count', 'rating_total', 'texture',
    'thread_size', 'wpi', 'yardage', 'personal_attributes',
]

# Pass the labels through `columns=`: the positional axis form `drop(labels, 1)`
# is deprecated since pandas 1.3 and raises TypeError from pandas 2.0 onwards.
yarndetails_df = yarndetails_df.drop(columns = yarndetails_dup_cols)


In [None]:
yarndetails_df.head()

In [None]:
yarndetails_df.info()

# Look at remaining columns and drop unneeded

In [None]:
print(yarndetails_df.notes_html.values)

In [None]:
# keep, but break useful info out of dictionary
# want to use metric needle size, but break out US needle size as well in case some don't have metric info
# for any that don't have metric info, can use US size to supply data
print(yarndetails_df.min_needle_size.values)

In [None]:
# keep, break out yarn weight name
print(yarndetails_df.yarn_weight.values)

In [None]:
print(yarndetails_df.yarn_company.values)

In [None]:
# keep, want to analyze yarn fiber content
# (at least at high level - animal fiber yes/no, silk/wool/alpaca/cotton etc)
# will include percentages in analysis depending on how I can make it work
print(yarndetails_df.yarn_fibers.values)

In [None]:
print(yarndetails_df.photos.values)

In [None]:
# Drop unneeded columns
# (`columns=` keyword: the positional axis form `drop(labels, 1)` raises TypeError in pandas 2.0+)

yarndetails_df = yarndetails_df.drop(columns = ['notes_html', 'yarn_company', 'photos'])

In [None]:
yarndetails_df.head()

In [None]:
yarndetails_df.info()

# Merge yarn dataframes using yarnlisting_df.id and yarndetails_df.yarn_id

In [None]:
# Inner join (the pd.merge default, made explicit here): only yarns present in both
# the listings frame (key: id) and the details frame (key: yarn_id) are kept.
# NOTE(review): both inputs still appear to carry a `yarn_weight` column, so the result
# will presumably contain `yarn_weight_x` / `yarn_weight_y` - confirm and keep only one if so.
yarn_df = pd.merge(yarnlisting_df, yarndetails_df,
                   how = 'inner',
                   left_on = 'id',
                   right_on = 'yarn_id')

# after the join 'id' holds the same values as 'yarn_id', so only one key column is kept;
# `columns=` keyword is required because `drop(labels, 1)` raises TypeError in pandas 2.0+
yarn_df = yarn_df.drop(columns = ['id'])
yarn_df.head()

In [None]:
# save cleaned yarn dataframe to csv

yarn_df.to_csv('../data/df_yarn_clean.csv', index = False)

-----------------------------------

# Shops dataframes

In [2]:
shoplisting_df = pd.read_csv('../data/df_shoplistings.csv')

In [6]:
shoplisting_df.head()

Unnamed: 0.1,Unnamed: 0,address,city,closed,facebook_page,free_wifi,id,latitude,location,longitude,name,parking,permalink,phone,pos_online,ravelry_retailer,seating,shop_email,twitter_id,url,wheelchair_access,zip,distance,country,state
0,0,5 Alabama Avenue (soon),LaFayette,False,https://www.facebook.com/courthouseyarnier/,True,6459,,"5 Alabama Avenue (soon), LaFayette, Alabama",,Courthouse Yanier,True,courthouse-yanier,910-346-6430,True,False,True,phootsy@courthouseyarnier.com,,http://www.yarnier.com,True,36862,,"{'id': 229, 'name': 'United States'}","{'id': 3596, 'name': 'Alabama'}"
1,1,817B Regal Drive,Huntsville,False,http://www.facebook.com/#!/pages/Fiber-Art-Work/,True,9966,34.7091,"817B Regal Drive, Huntsville, Alabama",-86.5875,Fiber Art Work,True,fiber-art-work,256-656-0163,True,True,True,fiberartwork@gmail.com,,http://www.fiberartwork.com,True,35801,,"{'id': 229, 'name': 'United States'}","{'id': 3596, 'name': 'Alabama'}"
2,2,"25219 Hwy 195, P.O. Box 392 (for mailing)",Double Springs,False,Fine Yarns On Main,,11655,34.1465,"25219 Hwy 195, P.O. Box 392 (for mailing), Dou...",-87.4022,Fine Yarns on Main,,fine-yarns-on-main,205-489-8009,False,True,,fineyarnsonmain@gmail.com,,http://www.ravelry.com/shops/fine-yarns-on-main,,35553,,"{'id': 229, 'name': 'United States'}","{'id': 3596, 'name': 'Alabama'}"
3,3,15314 Court Street,Moulton,False,,,8023,34.4825,"15314 Court Street, Moulton, Alabama",-87.2766,Granny’s Quilt Shop,,grannys-quilt-shop,256-685-3000,False,False,,,,,,35650,,"{'id': 229, 'name': 'United States'}","{'id': 3596, 'name': 'Alabama'}"
4,4,105 D Church Street,Madison,False,https://www.facebook.com/Hook-A-Frog-Fiber-and...,True,12262,34.6946,"105 D Church Street, Madison, Alabama",-86.7487,Hook A Frog Fiber & Fun,,hook-a-frog-fiber--fun,256-325-0572,True,False,True,hookafrog@gmail.com,,http://www.hookafrog.net,,35758,,"{'id': 229, 'name': 'United States'}","{'id': 3596, 'name': 'Alabama'}"


In [4]:
shoplisting_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2613 entries, 0 to 2612
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         2613 non-null   int64  
 1   address            2550 non-null   object 
 2   city               2610 non-null   object 
 3   closed             2613 non-null   bool   
 4   facebook_page      1308 non-null   object 
 5   free_wifi          1024 non-null   object 
 6   id                 2613 non-null   int64  
 7   latitude           2519 non-null   float64
 8   location           2610 non-null   object 
 9   longitude          2519 non-null   float64
 10  name               2613 non-null   object 
 11  parking            814 non-null    object 
 12  permalink          2613 non-null   object 
 13  phone              2083 non-null   object 
 14  pos_online         2613 non-null   bool   
 15  ravelry_retailer   2613 non-null   bool   
 16  seating            1297 

In [5]:
# drop, all shops listed are in business according to the database
shoplisting_df.closed.value_counts()

False    2613
Name: closed, dtype: int64

In [11]:
# keep for now even though there are apparently non-US shops - will clean those records after merge 
# with shop details df

shoplisting_df.country.value_counts()

{'id': 229, 'name': 'United States'}       2535
{'id': 39, 'name': 'Canada'}                 30
{'id': 228, 'name': 'United Kingdom'}        13
{'id': 13, 'name': 'Australia'}               9
{'id': 105, 'name': 'Ireland'}                7
{'id': 199, 'name': 'South Africa'}           4
{'id': 156, 'name': 'New Zealand'}            4
{'id': 162, 'name': 'Northern Ireland'}       4
{'id': 84, 'name': 'Germany'}                 4
{'id': 103, 'name': 'India'}                  2
{'id': 88, 'name': 'Greece'}                  1
Name: country, dtype: int64

In [9]:
shoplisting_df.free_wifi.value_counts()

True     874
False    150
Name: free_wifi, dtype: int64

In [12]:
shoplisting_df.pos_online.value_counts()

False    1919
True      694
Name: pos_online, dtype: int64

In [13]:
shoplisting_df.ravelry_retailer.value_counts()

True     1430
False    1183
Name: ravelry_retailer, dtype: int64

In [15]:
# drop unneeded columns from shop listing df

# Columns removed from the shop listings dataframe, grouped by the reason for dropping them.
shoplisting_drop_cols = [
    # index column that was saved into the csv
    # (write csv files with index = False in future to avoid this)
    'Unnamed: 0',
    # one column with no non-null values
    'distance',
    # a lot of nulls in four columns that won't impact the analysis
    'parking', 'free_wifi', 'seating', 'twitter_id',
    # unneeded information
    'permalink', 'phone', 'url', 'wheelchair_access', 'facebook_page', 'closed',
]

# Pass the labels through `columns=`: the positional axis form `drop(labels, 1)`
# is deprecated since pandas 1.3 and raises TypeError from pandas 2.0 onwards.
shoplisting_df = shoplisting_df.drop(columns = shoplisting_drop_cols)

In [20]:
shoplisting_df.head()

Unnamed: 0,address,city,id,latitude,location,longitude,name,pos_online,ravelry_retailer,shop_email,zip,country,state
0,5 Alabama Avenue (soon),LaFayette,6459,,"5 Alabama Avenue (soon), LaFayette, Alabama",,Courthouse Yanier,True,False,phootsy@courthouseyarnier.com,36862,"{'id': 229, 'name': 'United States'}","{'id': 3596, 'name': 'Alabama'}"
1,817B Regal Drive,Huntsville,9966,34.7091,"817B Regal Drive, Huntsville, Alabama",-86.5875,Fiber Art Work,True,True,fiberartwork@gmail.com,35801,"{'id': 229, 'name': 'United States'}","{'id': 3596, 'name': 'Alabama'}"
2,"25219 Hwy 195, P.O. Box 392 (for mailing)",Double Springs,11655,34.1465,"25219 Hwy 195, P.O. Box 392 (for mailing), Dou...",-87.4022,Fine Yarns on Main,False,True,fineyarnsonmain@gmail.com,35553,"{'id': 229, 'name': 'United States'}","{'id': 3596, 'name': 'Alabama'}"
3,15314 Court Street,Moulton,8023,34.4825,"15314 Court Street, Moulton, Alabama",-87.2766,Granny’s Quilt Shop,False,False,,35650,"{'id': 229, 'name': 'United States'}","{'id': 3596, 'name': 'Alabama'}"
4,105 D Church Street,Madison,12262,34.6946,"105 D Church Street, Madison, Alabama",-86.7487,Hook A Frog Fiber & Fun,True,False,hookafrog@gmail.com,35758,"{'id': 229, 'name': 'United States'}","{'id': 3596, 'name': 'Alabama'}"


In [21]:
shoplisting_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2613 entries, 0 to 2612
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   address           2550 non-null   object 
 1   city              2610 non-null   object 
 2   id                2613 non-null   int64  
 3   latitude          2519 non-null   float64
 4   location          2610 non-null   object 
 5   longitude         2519 non-null   float64
 6   name              2613 non-null   object 
 7   pos_online        2613 non-null   bool   
 8   ravelry_retailer  2613 non-null   bool   
 9   shop_email        1344 non-null   object 
 10  zip               2551 non-null   object 
 11  country           2613 non-null   object 
 12  state             2610 non-null   object 
dtypes: bool(2), float64(2), int64(1), object(8)
memory usage: 229.8+ KB


In [30]:
# examine shop listings with country listed other than US
# the country column stores each value as the text of a dict, so whole strings are matched
othercountry_list = [
    "{'id': 39, 'name': 'Canada'}",
    "{'id': 228, 'name': 'United Kingdom'}",
    "{'id': 13, 'name': 'Australia'}",
    "{'id': 105, 'name': 'Ireland'}",
    "{'id': 199, 'name': 'South Africa'}",
    "{'id': 156, 'name': 'New Zealand'}",
    "{'id': 162, 'name': 'Northern Ireland'}",
    "{'id': 84, 'name': 'Germany'}",
    "{'id': 103, 'name': 'India'}",
    "{'id': 88, 'name': 'Greece'}",
]

# boolean mask: True for rows whose country is one of the non-US values above
non_us_mask = shoplisting_df['country'].isin(othercountry_list)
shoplisting_df.loc[non_us_mask]

Unnamed: 0,address,city,id,latitude,location,longitude,name,pos_online,ravelry_retailer,shop_email,zip,country,state
664,56 Main Street,Ballymoney,8409,54.9593,"56 Main Street, Ballymoney, Ballymoney, Northe...",-6.48476,A Twaddle,False,False,,BT53 6AL,"{'id': 162, 'name': 'Northern Ireland'}","{'id': 224, 'name': 'Ballymoney'}"
670,39 Main Road,"Cleeve, North Somerset",11623,51.3860,"39 Main Road, Cleeve, North Somerset, North So...",-2.78580,AlterKnit Universe (Art Equals Happy),False,True,AU@artequalshappy.co.uk,BS494NS,"{'id': 228, 'name': 'United Kingdom'}","{'id': 3493, 'name': 'North Somerset'}"
671,137-139 Main St,Fivemiletown,8475,54.3773,"137-139 Main St, Fivemiletown, Dungannon, Nort...",-7.31790,Amanda’s Wool & Craft Shop,False,False,,BT75 0PG,"{'id': 162, 'name': 'Northern Ireland'}","{'id': 220, 'name': 'Dungannon'}"
672,"324 Main Street, Unit B",Antigonish,14492,45.6226,"324 Main Street, Unit B, Antigonish, Nova Scot...",-61.99370,AntigoKnits,True,False,knitwit@antigoknits.com,B2G 2C4,"{'id': 39, 'name': 'Canada'}","{'id': 850, 'name': 'Nova Scotia'}"
700,66 South Main Street,Wexford,8482,52.3374,"66 South Main Street, Wexford, Wexford, Ireland",-6.46118,Colman Doyle,False,False,,,"{'id': 105, 'name': 'Ireland'}","{'id': 1746, 'name': 'Wexford'}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,Landstraße 6,Wörth a.Main,14596,49.7974,"Landstraße 6, Wörth a.Main, Bayern, Germany",9.15692,Wolle Straub,False,False,info@wolle-straub.de,63939,"{'id': 84, 'name': 'Germany'}","{'id': 1402, 'name': 'Bayern'}"
1482,126 - 134 Station Road,New Milton,8169,50.7518,"126 - 134 Station Road, New Milton, Hampshire,...",-1.65442,Bradbeers,False,False,,BH25 6LW,"{'id': 228, 'name': 'United Kingdom'}","{'id': 3537, 'name': 'Hampshire'}"
1591,3500 Route 635,"Harvey, York County",624,45.5917,"3500 Route 635, Harvey, York County, New Bruns...",-67.30530,Briggs & Little Outlet Shop,False,False,,E6K 1J8,"{'id': 39, 'name': 'Canada'}","{'id': 855, 'name': 'New Brunswick'}"
1674,50 York St,Sydney,527,-33.8697,"50 York St, Sydney, New South Wales, Australia",151.20600,Morris and Sons,True,True,,2000,"{'id': 13, 'name': 'Australia'}","{'id': 435, 'name': 'New South Wales'}"


In [32]:
# collect the index labels of the non-US rows, remove them, then renumber the rows from 0
non_us_rows = shoplisting_df['country'].isin(othercountry_list)
drop_index = shoplisting_df.index[non_us_rows]
us_only = shoplisting_df.drop(index = drop_index)
shoplisting_df = us_only.reset_index(drop = True)

In [33]:
shoplisting_df.loc[shoplisting_df.country.isin(othercountry_list)]

Unnamed: 0,address,city,id,latitude,location,longitude,name,pos_online,ravelry_retailer,shop_email,zip,country,state


In [36]:
shoplisting_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2535 entries, 0 to 2534
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   address           2472 non-null   object 
 1   city              2532 non-null   object 
 2   id                2535 non-null   int64  
 3   latitude          2446 non-null   float64
 4   location          2532 non-null   object 
 5   longitude         2446 non-null   float64
 6   name              2535 non-null   object 
 7   pos_online        2535 non-null   bool   
 8   ravelry_retailer  2535 non-null   bool   
 9   shop_email        1313 non-null   object 
 10  zip               2483 non-null   object 
 11  country           2535 non-null   object 
 12  state             2533 non-null   object 
dtypes: bool(2), float64(2), int64(1), object(8)
memory usage: 222.9+ KB


In [38]:
# save cleaned shops dataframe to csv

shoplisting_df.to_csv('../data/df_shop_clean.csv', index = False)

# Shop details dataframe

After a closer look at the shop details dataset, I found no additional details that need to be added to the shop listings dataframe.