# Text Analysis of Titles Descriptions and Tags

In [1]:
import pandas as pd
import numpy as np
import re
import datetime  
import matplotlib.pyplot as plt
import seaborn as sns

#natural language toolkit for list of stop words
import nltk 
#nltk.download('stopwords')
from nltk.corpus import stopwords

In [2]:
# combine all three datasets together for text analysis
# import the datasets
df = pd.read_csv('data/mergedData022522clean.csv')
df2 = pd.read_csv('data/mergedData033122clean.csv')
df3 = pd.read_csv('data/mergedData041022clean.csv')

In [87]:
# append the first dataframe with the other two sets of data
data = df.append([df2, df3])

In [88]:
# remove duplicate listings, keep the most recent
data = data[data.duplicated(subset='listing_id', keep='first')==False]

In [89]:
# reset the index
data = data.reset_index(drop=True)

## Titles and Descriptions

In [72]:
#make all title and descriptions lowercase
title = data['title_listing'].str.lower()
desc = data['description'].str.lower()

### Listings that contain specific products of interest

In [55]:
# function that returns percentage of listings that contain a specific word

def percentage_item(text, word):
    true = text[text.str.contains(' ' + word + ' ')==True]
    percent = len(true)/len(text)
    return percent

In [22]:
# percentage of listings that contain a specific product in their title and description

# I chose these items because I was interested in them
items = ['mug', 'plate', 'bowl', 'jar', 'planter', 'berry bowl', 'butter dish', 'cup', 'tumbler', 'bacon cooker', 'spoon rest']

for i in items:
    print("The word '" + i + "' is in " + "{:.2%}".format(percentage_item(title, i)) + " of titles and " + "{:.2%}".format(percentage_item(desc, i)) + " of descriptions.")


The word 'mug' is in 11.44% of titles and 19.69% of descriptions.
The word 'plate' is in 2.87% of titles and 5.72% of descriptions.
The word 'bowl' is in 8.06% of titles and 13.56% of descriptions.
The word 'jar' is in 1.51% of titles and 1.77% of descriptions.
The word 'planter' is in 1.47% of titles and 1.87% of descriptions.
The word 'berry bowl' is in 0.10% of titles and 0.14% of descriptions.
The word 'butter dish' is in 0.20% of titles and 0.21% of descriptions.
The word 'cup' is in 6.18% of titles and 10.43% of descriptions.
The word 'tumbler' is in 0.96% of titles and 1.53% of descriptions.
The word 'bacon cooker' is in 0.00% of titles and 0.02% of descriptions.
The word 'spoon rest' is in 0.51% of titles and 1.13% of descriptions.


### Remove punctuation, special characters and stop words from the titles and descriptons

In [73]:
# function that removes "\n", links, as well as special characters and numbers from text
def clean_text(text):
    for i in range (0, len(text)-1):
        n = '\n'
        string = text[i]
        if n in string:
            string = string.replace(n, " ")
            text[i] = string
        text[i] = re.sub("[^A-za-z]+"," ", text[i])
        text[i] = re.sub(r"http\S+","",text[i])
        text[i] = re.sub(r"www\S+","",text[i])

In [77]:
# Clean descriptions and titles
clean_text(desc)
clean_text(title)

In [78]:
#split the title strings on the spaces
title = title.str.split(" ")
desc = desc.str.split(" ")

In [79]:
## define the stopwords
stop = stopwords.words('english')

In [80]:
# add pottery-related keywords, and other irrelevant words to stopwords
stop.extend(['handmade','stoneware','pottery','ceramic','clay','x', '|','cm', 'oz', 'ml', ' ', '', 'www', 'com',
'shipping','item','items','hand','made','piece','pieces','make','please','one','use','may','inches','also',
'high','etsy', 'quot', 'wheel', 'thrown', 'wheelthrown', 'glaze', 'glazed','unique'])

In [81]:
# Define a function that removes the stopwords
""" 
Args:
txt (series): the text as a series of lists
wrds (list): the stopwords to be removed from the list
"""
def remove_stopwords(txt, wrds):
    for i in range (0, len(txt)):
        for s in wrds:
            lst = txt[i]
            lst = [w for w in lst if w != s]
            txt[i] = lst

In [82]:
remove_stopwords(desc, stop)
remove_stopwords(title, stop)

### Save the cleaned descriptions and titles to csv to use for clustering algorithm

In [84]:
#title.to_csv('data/titles-combined-cleaned.csv')
#desc.to_csv('data/descriptions-combined-cleanedA.csv')

### Word Counts

In [None]:
#define a function that reduces a column to a 1-dimensional series (to get word counts)
def oneD(column):
 return pd.Series([x for list in column for x in list])

In [None]:
#get the value counts for each of the words in the description
descWordCounts = oneD(desc).value_counts()

In [None]:
# Get the top words in the descriptions
top25desc = descWordCounts.head(25).sort_values(ascending=False)

In [None]:
# Get the value counts for each of the words in the titles
titleWordCounts = oneD(title).value_counts()

In [None]:
# Get the top 25 words in the titles
top25title = titleWordCounts.head(25).sort_values(ascending=False)

In [None]:
# plot top tags in different subsets of the data
fig, axes = plt.subplots(1,2, figsize = (14, 8))
fig.suptitle('Top Words')

# plot the top words in the descriptions
sns.barplot(ax=axes[0], x=top25desc.values, y=top25desc.index, palette='light:#639c6f_r').set(title='Top 25 Words in the Descriptions')

# plot the top wordsin the titles
sns.barplot(ax=axes[1],x=top25title.values, y=top25title.index, palette='light:#6255aa_r').set(title='Top 25 Words in the Titles')

## Tags

In [None]:
#make all tags lowercase
data['tags'] = data['tags'].str.lower()

In [None]:
#transform tags column from strings to lists
data['tags'] = data['tags'].apply(eval)

In [None]:
#get just the tags
tags = data['tags']

### Remove pottery-related stopwords from the tags

In [None]:
#define common stopwords
stopTags = ['handmade','stoneware','pottery','ceramic','clay','handmade pottery',
            'stoneware pottery','ceramics','handmade ceramics','gift','wheel thrown','one of a kind']

In [None]:
#remove stop words from tags
for i in range (0, len(tags)):
    for s in stopTags:
        tags_list = tags[i]
        if s in tags_list:
            tags_list.remove(s)

In [None]:
#add clean tags to original df
data['clean_tags'] = tags

In [None]:
# save dataframe with clean tags 
#data.to_csv('data/tags-cleaned041022.csv')

In [None]:
#get the value counts for each of the tags
tagCounts = oneD(tags).value_counts()

In [None]:
# Sort the top tags overall
top25 = tagCounts.head(25).sort_values(ascending=False)

In [None]:
# the word "spoon" is appearing in the top words in the titles so I wanted to see what the related tags are:
tagCounts[tagCounts.index.str.contains(pat = 'spoon')]

#### I repeated the above for all the top words in the titles that were products so I could see if they were normally used in a phrase like "spoon rest" rather than individually and used this later on in the "Business Information" notebook

### Get top 20 tags for top-favorited and top rated shops

In [None]:
#dataframe with tags, num of favorites
itemData = data[['listing_id','clean_tags','num_favorers_listing']]

In [None]:
#look at top 10 listings according to number of favorers
itemData.sort_values(by = ['num_favorers_listing'], ascending = False).head(10)

In [None]:
#this is the 3,873 listings with at least 10 favorers
TenOrMore = itemData[itemData['num_favorers_listing']>=10]

In [None]:
#get the value counts for each of the tags in TenOrMore
tagCountsFavs = oneD(TenOrMore["clean_tags"]).value_counts()

In [None]:
# Sort the top 25 tags from the listings with ten or more favorers
top25Favs = tagCountsFavs.head(25).sort_values(ascending=False)

### Get top 25 tags for shops with at least 10 review and review averages greater than 4.9

In [None]:
# dataframe with tags, shop review count, and review average
shopData = data[['listing_id','clean_tags','num_favorers_listing','review_average', 'review_count']]

In [None]:
# get the shops that have at least 10 reviews
shops10reviews = shopData[shopData['review_count']>=10]

In [None]:
#this is the 9036 listings from shops with at least 10 reviews
TenReviews =  shopData[shopData['review_count']>=10]

In [None]:
# this is the 8106 listings from shops with at least 10 reviews AND a 4.8 star rating or higher
topShops =  TenReviews[TenReviews['review_average']>=4.9]

In [None]:
#get the value counts for each of the tags in the topShops listings
tagCountsTopShops = oneD(topShops["clean_tags"]).value_counts()

# Sort the top 25 tags from the listings with ten or more favorers
top25TopShops = tagCountsTopShops.head(25).sort_values(ascending=False)

In [None]:
# plot top tags in different subsets of the data
fig, axes = plt.subplots(3, 1, figsize = (12, 18))
fig.suptitle('Top Tags')

# plot all the top tags overall
sns.barplot(ax=axes[0], x=top25.values, y=top25.index, palette='light:#4079bf_r')
axes[0].set_title('Top 25 Tags All Listings')

# plot the top tags for listings with 10 or more favorers
sns.barplot(ax=axes[1], x=top25Favs.values, y=top25Favs.index, palette='light:#b96cf0_r')
axes[1].set_title('Top 25 Tags for Listings w/10 or more Favorers')

# plot the top tags for listings from shops with at least 10 reviews and a 4.9 star rating or higher
sns.barplot(ax=axes[2], x=top25TopShops.values, y=top25TopShops.index, palette='light:#90aa55_r')
axes[2].set_title('Top 25 Tags for Listings from Shops w/4.9 or Higher Rating and At Least 10 Reviews')