# EDA on yarn data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

In [None]:
# Read the yarn dataset from the CSV under ../data into yarns_df.
# The path is relative, so it resolves against the notebook's working directory.
yarns_df = pd.read_csv('../data/df_yarn_clean2.csv')

In [None]:
# Let pandas render up to 60 columns so wide frames are not truncated,
# then preview the first five rows.
pd.options.display.max_columns = 60
yarns_df.head(5)

In [None]:
# Summary statistics for the full yarn dataset (describe() defaults)
yarns_df.describe()

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values
yarns_df.info()

In [None]:
# Number of rows per value of the 'discontinued' flag.
# value_counts() leaves out missing values by default.
yarns_df['discontinued'].value_counts()


In [None]:
# Share of yarns still in production vs. discontinued, saved as a pie chart.

# Slice sizes are computed from yarns_df rather than hard-coded, so the chart
# stays correct when the underlying CSV changes. Rows with a missing
# 'discontinued' flag match neither comparison and are left out of the chart.
labels = ['In Production', 'Discontinued']
slices = [
    (yarns_df['discontinued'] == False).sum(),  # not discontinued -> in production
    (yarns_df['discontinued'] == True).sum(),
]
colors = ['#EE6E62', '#F3F4F0']

# Wedges are drawn counter-clockwise; startangle=90 rotates the start to the top.
plt.pie(slices, labels = labels, colors = colors, startangle=90, autopct='%1.1f%%')
plt.title('Discontinued vs In Production Yarns', fontname = 'Lucida Grande', fontsize = 24)

# Trailing ';' suppresses the cell's text output.
plt.savefig('../images/disc_yarn.png', bbox_inches = 'tight');

In [None]:
# Subset of yarns that are not discontinued. The original plan was to analyze
# only these, but the full dataset is expected to give a more accurate picture,
# so this frame is kept for reference.
yarns_current_df = yarns_df[yarns_df['discontinued'].eq(False)]
yarns_current_df.head(5)

In [None]:
# Summary statistics restricted to the yarns_current_df subset
yarns_current_df.describe()

In [None]:
# Distribution of the machine_washable flag; dropna=False also counts missing values
yarns_df['machine_washable'].value_counts(dropna = False)

In [None]:
# Rows where machine_washable is missing. These can probably be treated as
# False, but inspect them first before committing to that.
missing_washable = yarns_df['machine_washable'].isnull()
nullmachine_df = yarns_df[missing_washable]
nullmachine_df.head(5)

In [None]:
# Yarns with a missing machine_washable flag, counted by fiber type
# (the fiber_catt_name column).
# Despite its name, nullmachine_list holds the pandas Series returned by
# value_counts(), not a Python list.

nullmachine_list = nullmachine_df['fiber_catt_name'].value_counts()
nullmachine_list

In [None]:
# Treat yarns with no machine_washable flag as not machine washable.
# TODO: do this better when there is time -- this is a blanket assumption and
# does not yet use the per-fiber breakdown of the missing values.

# The explicit astype(bool) guarantees a real boolean column. Without it the
# result depends on fillna() silently downcasting (deprecated in newer pandas)
# if the column was read as object dtype (True/False/NaN).
# Re-running this cell is safe: filling and casting again changes nothing.
yarns_df['machine_washable'] = yarns_df['machine_washable'].fillna(False).astype(bool)
yarns_df.head()

In [None]:
# Re-check the flag distribution after the fill; dropna=False would surface any remaining nulls
yarns_df['machine_washable'].value_counts(dropna = False)

In [None]:
# Share of machine washable vs. non machine washable yarns, saved as a pie chart.

# Slice sizes are computed from yarns_df rather than hard-coded, so the chart
# stays correct when the data changes. This expects missing machine_washable
# values to have been filled already; any still-missing value matches neither
# comparison and would be left out of the chart.
labels = ['Machine Washable','Not Machine Washable']
slices = [
    (yarns_df['machine_washable'] == True).sum(),
    (yarns_df['machine_washable'] == False).sum(),
]
colors = ['#EE6E62', '#F3F4F0']

# Wedges are drawn counter-clockwise; startangle=90 rotates the start to the top.
plt.pie(slices, labels = labels, colors = colors, startangle = 90, autopct = '%1.1f%%')
plt.title('Distribution of Machine Washable vs Non Machine Washable Yarns', fontname = 'Lucida Grande', fontsize = 24)

# Trailing ';' suppresses the cell's text output.
plt.savefig('../images/wash_yarn.png', bbox_inches = 'tight');

In [None]:
# Split the dataset by the machine_washable flag for the per-weight charts.
machineyarn_df = yarns_df[yarns_df['machine_washable'].eq(True)]
nonmachineyarn_df = yarns_df[yarns_df['machine_washable'].eq(False)]

In [None]:
# Five most common yarn weights among machine washable yarns

fig, ax = plt.subplots(figsize = (30, 10))

# Count rows per yarn weight, keep the five largest, then sort ascending:
# barh draws the first entry at the bottom, so the largest bar ends up on top.
plotdata = machineyarn_df['yarn_weight'].value_counts().nlargest(5).sort_values()

# Horizontal bar chart drawn on the axes created above
plotdata.plot(kind = 'barh', ax = ax, color = '#EE6E62', fontsize = 14)
ax.set_title('Count of Machine Washable Yarns by Yarn Weight', fontname = 'Lucida Grande', fontsize = 24)

fig.savefig('../images/machine_weight.png', bbox_inches = 'tight');

In [None]:
# Five most common yarn weights among non machine washable yarns

fig, ax = plt.subplots(figsize = (30, 10))

# Count rows per yarn weight, keep the five largest, then sort ascending:
# barh draws the first entry at the bottom, so the largest bar ends up on top.
plotdata = nonmachineyarn_df['yarn_weight'].value_counts().nlargest(5).sort_values()

# Horizontal bar chart drawn on the axes created above
plotdata.plot(kind = 'barh', ax = ax, color = '#EE6E62', fontsize = 14)
ax.set_title('Count of Non Machine Washable Yarns by Yarn Weight', fontname = 'Lucida Grande', fontsize = 24)

fig.savefig('../images/nonmachine_weight.png', bbox_inches = 'tight');