# EDA on patterns data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

In [None]:
# Load the cleaned patterns export. low_memory=False has pandas parse the file
# in one pass rather than in chunks, which avoids mixed-dtype inference.
PATTERNS_CSV = '../data/df_patterns_clean2.csv'
patterns_df = pd.read_csv(PATTERNS_CSV, low_memory=False)

In [None]:
# Raise the column display limit to 60, then preview the first five rows.
pd.options.display.max_columns = 60
patterns_df.head(5)

In [None]:
# Summary statistics for the numeric columns (count, mean, std, min, quartiles, max).
patterns_df.describe(percentiles=[0.25, 0.5, 0.75])

There are definitely outliers in the data. Some are not surprising (favorites_count, projects_count, queued_projects_count, rating_count). These are related to pattern popularity and it was already clear that some patterns are enormously popular. I expect high counts on favorites, queued, and rating to correlate with high project count.

Investigate outliers in these columns: difficulty_average, gauge, price, row_gauge, yardage, yardage_max

In [None]:
# Rows whose average difficulty rating is above 8.
patterns_df.loc[patterns_df['difficulty_average'].gt(8)]


In [None]:
# Histogram of average difficulty.
# The distribution tends somewhat to the right. That is not unexpected: the measure is
# subjective, and people tend to rate something they can do as lower on the difficulty scale.
patterns_df[['difficulty_average']].hist();

In [None]:
# Rows with a stitch gauge above 60.
#
# Observations:
# - Some of the outliers are due to differing gauge_patterns. For meaningful analysis, clean up
#   that column and limit gauge analysis to stockinette (the widely accepted standard and what
#   yarn manufacturers use).
# - Really high outliers (99 stitches over 1 or 4 inches) are clearly errors - nobody is
#   getting that with aran yarn.
# - Possibly people don't pay attention to the called-for gauge in crochet - they just use the
#   hook for the size of yarn and go.
# - It will definitely be easier to limit to the standard gauge stitch pattern.
patterns_df.loc[patterns_df['gauge'].gt(60)]

In [None]:
# Histogram of stitch gauge in 20 bins.
patterns_df[['gauge']].hist(bins=20);

In [None]:
# Rows priced above 50.
# The high-price outliers are due to different currencies. To analyze pattern prices it is
# probably most useful to limit to USD or to convert the foreign currencies.
patterns_df.loc[patterns_df['price'].gt(50)]

In [None]:
# Number of patterns per currency, with missing values counted as their own bucket.
#
# USD accounts for 17785 of the non-free patterns and 8962 patterns in the data are free.
# That leaves a little over 3000, or 10%, divided among other currencies.
# MVP might be USD only - expand to include converted currencies if time allows.
patterns_df['currency'].value_counts(dropna=False)

In [None]:
# Histogram of price in 40 bins.
patterns_df[['price']].hist(bins=40);

In [None]:
# Rows with a row gauge above 60.
# row_gauge should just be ignored: it is not often critical to a pattern, and people are
# mostly concerned with stitch gauge.
patterns_df.loc[patterns_df['row_gauge'].gt(60)]

In [None]:
# Rows calling for more than 10000 yards.
# The biggest outlier is a crochet blanket (in thread weight!), so most likely not an error.
# To get a meaningful yardage analysis we should compare like to like, so break out
# categories (could compare knitting to crochet yardage).
patterns_df.loc[patterns_df['yardage'].gt(10000)]

In [None]:
# Histogram of yardage in 60 bins.
# There is a definite bias towards lower-yardage projects - understandable, since they get
# done more quickly.
patterns_df[['yardage']].hist(bins=60);

In [None]:
# Patterns with more than 10000 linked projects.
patterns_df.loc[patterns_df['projects_count'].gt(10000)]

In [None]:
# Number of patterns per craft, smallest first, as a one-column frame.
craft_counts = patterns_df['craft_name'].value_counts().sort_values()
patterns_craft = craft_counts.to_frame()
patterns_craft

In [None]:
# Share of patterns by craft, drawn as a pie chart and saved to ../images.
#
# Slice sizes and labels are derived from the data rather than typed in by hand, so the
# chart stays correct if the underlying CSV changes. The ascending sort puts the
# smallest craft first.
craft_pattern_counts = patterns_df['craft_name'].value_counts().sort_values()
labels = craft_pattern_counts.index.tolist()
slices = craft_pattern_counts.tolist()
# NOTE(review): only two colors are defined; matplotlib cycles through them if the data
# ever contains more than two crafts.
colors = ['#F3F4F0', '#EE6E62']

# Wedges are laid out counter-clockwise, starting from the 90-degree position.
plt.pie(slices, labels = labels, colors = colors, startangle=90, autopct='%1.1f%%')
plt.title('Distribution of Patterns by Craft', fontname = 'Lucida Grande', fontsize = 24)

plt.savefig('../images/pattcount_craft.png');

In [None]:
# Total number of projects per craft, smallest first.
projects_craft = (
    patterns_df
    .groupby('craft_name')[['projects_count']]
    .sum()
    .sort_values('projects_count')
)
projects_craft

In [None]:
# Share of projects by craft, drawn as a pie chart and saved to ../images.
#
# The slice sizes are summed from the data as numbers. They were previously hard-coded
# as strings, which relied on implicit string-to-float coercion inside plt.pie and
# would silently go stale if the underlying CSV changed. The ascending sort puts the
# smallest craft first.
craft_project_totals = patterns_df.groupby('craft_name')['projects_count'].sum().sort_values()
labels = craft_project_totals.index.tolist()
slices = craft_project_totals.tolist()
# NOTE(review): only two colors are defined; matplotlib cycles through them if the data
# ever contains more than two crafts.
colors = ['#F3F4F0', '#EE6E62']

# Wedges are laid out counter-clockwise, starting from the 90-degree position.
plt.pie(slices, labels = labels, colors = colors, startangle=90, autopct='%1.1f%%')
plt.title('Distribution of Projects by Craft', fontname = 'Lucida Grande', fontsize = 24)

plt.savefig('../images/projectcount_craft.png');

In [None]:
# Knitting-only subset used by the charts below.
patterns_knitting_df = patterns_df.loc[patterns_df['craft_name'].eq('Knitting')]

In [None]:
# Top five knitting pattern types by number of patterns.
fig, ax = plt.subplots(figsize=(30, 10))

# value_counts gives the frequency per type and nlargest(5) keeps the five most common.
# The final ascending sort places the largest bar at the top of the horizontal chart.
plotdata = patterns_knitting_df['type_name'].value_counts().nlargest(5).sort_values()
plotdata.plot.barh(ax=ax, color='#EE6E62', fontsize=14)

# Axis labels are intentionally omitted - feedback from Mary and Mahesh.
ax.set_title('Count of Knitting Patterns by Type', fontname='Lucida Grande', fontsize=24)

fig.savefig('../images/pattcount_type.png', bbox_inches='tight');

In [None]:
# Total projects per knitting pattern type, keeping the ten largest; preview the first five.
project_type_df = (
    patterns_knitting_df
    .groupby('type_name')[['projects_count']]
    .sum()
    .sort_values('projects_count')
    .nlargest(10, 'projects_count')
)
project_type_df.head(5)


In [None]:
# Top five knitting pattern types by total number of projects.

# Sum projects per type_name and keep the five largest totals. The final ascending sort
# places the largest bar at the top of the horizontal chart.
plotdata = (
    patterns_knitting_df
    .groupby('type_name')[['projects_count']]
    .sum()
    .nlargest(5, 'projects_count')
    .sort_values('projects_count')
)

# DataFrame.plot creates its own figure, so the size is passed here; no legend or axis labels.
ax = plotdata.plot.barh(figsize=(30, 10), color='#EE6E62', fontsize=14, legend=None)
ax.set_title('Count of Knitting Projects by Type (in millions)', fontname='Lucida Grande', fontsize=24)

ax.figure.savefig('../images/projectcount_type.png', bbox_inches='tight');

In [None]:
# Number of patterns per called-for yarn weight; preview the five most common.
# NOTE(review): this counts all crafts (patterns_df), while the chart in the next cell
# uses the knitting-only subset - confirm that is intended.
patterncount_yarn = patterns_df.loc[:, 'yarn_weight_description'].value_counts()
patterncount_yarn.head(5)

In [None]:
# Top five called-for yarn weights across knitting patterns.
fig, ax = plt.subplots(figsize=(30, 10))

# value_counts gives the frequency per yarn weight and nlargest(5) keeps the five most common.
# The final ascending sort places the largest bar at the top of the horizontal chart.
plotdata = patterns_knitting_df['yarn_weight_description'].value_counts().nlargest(5).sort_values()
plotdata.plot.barh(ax=ax, color='#EE6E62', fontsize=14)

# No axis labels are set on this chart.
ax.set_title('Count of Called-For Yarn by Yarn Weight', fontname='Lucida Grande', fontsize=24)

fig.savefig('../images/yarncount_weight.png', bbox_inches='tight');

In [None]:
# Total knitting projects per called-for yarn weight, smallest first.
pattern_yarn_df = (
    patterns_knitting_df
    .groupby('yarn_weight_description')[['projects_count']]
    .sum()
    .sort_values('projects_count')
)
pattern_yarn_df

In [None]:
# Top five yarn weights by total number of knitting projects.

# Sum projects per yarn weight and keep the five largest totals. The final ascending sort
# places the largest bar at the top of the horizontal chart.
plotdata = (
    patterns_knitting_df
    .groupby('yarn_weight_description')[['projects_count']]
    .sum()
    .nlargest(5, 'projects_count')
    .sort_values('projects_count')
)

# DataFrame.plot creates its own figure, so the size is passed here; no legend or axis labels.
ax = plotdata.plot.barh(figsize=(30, 10), color='#EE6E62', fontsize=14, legend=None)
ax.set_title('Count of Yarn Weight by Projects (in millions)', fontname='Lucida Grande', fontsize=24)

ax.figure.savefig('../images/projectcount_weight.png', bbox_inches='tight');

In [None]:
# Pattern type and called-for yarn weight for knitting patterns, plus one subset
# per pattern type charted below.
calledfor_yarn_df = patterns_knitting_df[['type_name', 'yarn_weight_description']]
pattern_type = calledfor_yarn_df['type_name']

shawl_yarn_df = calledfor_yarn_df.loc[pattern_type.eq('Shawl/Wrap')]
sock_yarn_df = calledfor_yarn_df.loc[pattern_type.eq('Socks')]
hat_yarn_df = calledfor_yarn_df.loc[pattern_type.eq('Hat')]
child_yarn_df = calledfor_yarn_df.loc[pattern_type.eq('Child')]


In [None]:
# Top five called-for yarn weights for shawl/wrap patterns.
fig, ax = plt.subplots(figsize=(30, 10))

# value_counts gives the frequency per yarn weight and nlargest(5) keeps the five most common.
# The final ascending sort places the largest bar at the top of the horizontal chart.
plotdata = shawl_yarn_df['yarn_weight_description'].value_counts().nlargest(5).sort_values()
plotdata.plot.barh(ax=ax, color='#EE6E62', fontsize=14)

# No axis labels are set on this chart.
ax.set_title('Count of Called-For Yarn by Yarn Weight - Shawl/Wrap', fontname='Lucida Grande', fontsize=24)

fig.savefig('../images/shawlyarn_weight.png', bbox_inches='tight');

In [None]:
# Top five called-for yarn weights for sock patterns.
fig, ax = plt.subplots(figsize=(30, 10))

# value_counts gives the frequency per yarn weight and nlargest(5) keeps the five most common.
# The final ascending sort places the largest bar at the top of the horizontal chart.
plotdata = sock_yarn_df['yarn_weight_description'].value_counts().nlargest(5).sort_values()
plotdata.plot.barh(ax=ax, color='#EE6E62', fontsize=14)

# No axis labels are set on this chart.
ax.set_title('Count of Called-For Yarn by Yarn Weight - Socks', fontname='Lucida Grande', fontsize=24)

fig.savefig('../images/sockyarn_weight.png', bbox_inches='tight');

In [None]:
# Top five called-for yarn weights for hat patterns.
fig, ax = plt.subplots(figsize=(30, 10))

# value_counts gives the frequency per yarn weight and nlargest(5) keeps the five most common.
# The final ascending sort places the largest bar at the top of the horizontal chart.
plotdata = hat_yarn_df['yarn_weight_description'].value_counts().nlargest(5).sort_values()
plotdata.plot.barh(ax=ax, color='#EE6E62', fontsize=14)

# No axis labels are set on this chart.
ax.set_title('Count of Called-For Yarn by Yarn Weight - Hat', fontname='Lucida Grande', fontsize=24)

fig.savefig('../images/hatyarn_weight.png', bbox_inches='tight');

In [None]:
# Top five called-for yarn weights for child patterns.
fig, ax = plt.subplots(figsize=(30, 10))

# value_counts gives the frequency per yarn weight and nlargest(5) keeps the five most common.
# The final ascending sort places the largest bar at the top of the horizontal chart.
plotdata = child_yarn_df['yarn_weight_description'].value_counts().nlargest(5).sort_values()
plotdata.plot.barh(ax=ax, color='#EE6E62', fontsize=14)

# No axis labels are set on this chart.
ax.set_title('Count of Called-For Yarn by Yarn Weight - Child', fontname='Lucida Grande', fontsize=24)

fig.savefig('../images/childyarn_weight.png', bbox_inches='tight');