In [195]:
import pandas as pd
import numpy as np

import plotly.io as pio
import plotly.express as px

import ast

import re

import spacy
import spacy_fastlang

# Notebook-wide display configuration.
# Render plotly figures with the VS Code renderer.
pio.renderers.default = "vscode"
# Route DataFrame/Series `.plot` calls through plotly.
pd.options.plotting.backend = 'plotly'
# Disable pandas' chained-assignment warning (pandas default is 'warn').
pd.set_option('mode.chained_assignment', None)

Setup (run once from a shell) — download the spaCy English model:

```bash
python -m spacy download en_core_web_sm
```

In [119]:
# Product catalogue, indexed by product_id.
products = (
    pd.read_json('data/cerave-products.json', orient='records')
    .set_index('product_id')
)

# Review table; the `date` column is parsed to datetimes while reading.
reviews = pd.read_csv(
    'data/cerave-reviews.csv',
    parse_dates=['date'],
    low_memory=False,
)

In [120]:
# Number of non-null review_id values per product. The assignment aligns on
# the product_id index, so products absent from `reviews` end up as NaN.
review_counts = reviews.groupby('product_id')['review_id'].count()
products['total_reviews'] = review_counts

In [121]:
def _first_category_name(categories):
    """Return the English display name of the first hierarchy entry of the first category record."""
    first_record = categories[0]
    return first_record['hierarchy'][0]['displayName']['en']


products['category'] = products['categories'].map(_first_category_name)

In [122]:
# One row per review, enriched with its product's columns and indexed by
# the review date.
df = (
    reviews
    .merge(products, on='product_id')
    .set_index('date')
)

In [123]:
# Rows per category, counted through the non-null `sku_ids` values.
ratings_per_category = df.groupby('category')['sku_ids'].count()

fig = px.bar(
    x=ratings_per_category.index,
    y=ratings_per_category,
    labels={'x': 'Categories', 'y': '# of Ratings'},
)

# Order the bars from the largest total to the smallest.
fig.update_layout(
    barmode='stack',
    xaxis={'categoryorder': 'total descending'},
)
fig.show()

In [124]:
# Mean rating per category.
# Select `rating` *before* aggregating: calling .mean() on the whole grouped
# frame tries to average every column, and with pandas >= 2.0
# (numeric_only=False by default) that raises TypeError on the non-numeric
# columns such as `text`. It also avoids computing the aggregate twice.
mean_rating = df.groupby('category')['rating'].mean()

fig = px.bar(
    x=mean_rating.index,
    y=mean_rating,
    labels={
        'x': 'Categories',
        'y': 'Mean Rating'
    }
)

# Order the bars from the highest mean to the lowest.
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
# Zoom the y axis into the 4-4.8 band so differences between categories are visible.
fig.update_yaxes(range=[4, 4.8])
fig.show()

In [125]:
# How many reviews have no text at all?
df['text'].isna().sum()

7

In [126]:
# Discard reviews whose text is missing.
df = df.dropna(subset=['text'])

In [127]:
# Blank English pipeline with the `sentencizer` component added, so that
# processed documents carry sentence boundaries.
nlp = spacy.blank('en')
# Last expression of the cell: the component returned by add_pipe is echoed
# as the cell output.
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x3bd21cc40>

In [128]:
# Dedicated sentence-splitting pipeline. `sentence` deliberately does not read
# the notebook-level `nlp` name: that name is rebound further down to a
# pipeline without a sentencizer, after which `.sents` raises because no
# sentence boundaries are set.
_sentencizer_nlp = spacy.blank('en')
_sentencizer_nlp.add_pipe('sentencizer')


def sentence(text):
    """Split ``text`` into sentences.

    Parameters
    ----------
    text : str
        Raw text to segment.

    Returns
    -------
    list
        The sentence spans produced by the spaCy sentencizer, in document
        order.
    """
    return list(_sentencizer_nlp(text).sents)

# Sentence-split every review; each cell of `sents` holds a list of sentences.
df['sents'] = df['text'].map(sentence)

In [129]:
# Review length: the summed lengths of all of its sentences.
df['word_count'] = df['sents'].map(
    lambda spans: sum(len(span) for span in spans)
)

In [130]:
# Distribution of review lengths.
px.histogram(
    df,
    x='word_count',
    nbins=100,
    labels={
        'count': 'Entry Count',
        'word_count': 'Word Count',
    },
)

In [132]:
# Keep only the reviews whose length is at most 75.
df = df.loc[df['word_count'].le(75)]

In [149]:
# Fresh blank English pipeline that only carries the `language_detector`
# component.
# NOTE(review): this rebinds the notebook-level name `nlp`; the pipeline that
# was previously bound to it is no longer reachable under that name.
nlp = spacy.blank('en')
# Last expression of the cell: the component returned by add_pipe is echoed
# as the cell output.
nlp.add_pipe("language_detector")



<spacy_fastlang.LanguageDetector at 0x416227520>

In [194]:
def _is_english(text):
    """Return True when the language detector labels ``text`` as 'en'."""
    return nlp(text)._.language == 'en'


# Keep only the reviews detected as English.
df = df[df['text'].map(_is_english)]

In [199]:
# Sample review text used as input for the URL-stripping regex.
text_examp = (
    'Love it! I’ve been using this product every day for more than 6 months, '
    'and it helps my skin feel clean yet not dried. '
    'It contains ceramides, niacinamide and hyaluronic acid'
)

In [200]:
# Remove every run of non-whitespace characters that starts with "http".
re.sub(pattern=r'http\S+', repl='', string=text_examp)

' Love it! I’ve been using this product every day for more than 6 months, and it helps my skin feel clean yet not dried. It contains ceramides, niacinamide and hyaluronic acid'

In [None]:
def format_text(text):
    """Clean a review text.

    Currently strips URLs: every run of non-whitespace characters that
    starts with ``http`` is removed.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # re.sub returns a new string (str is immutable), so the result has to be
    # kept and returned; previously it was discarded and the function
    # returned None.
    text = re.sub(r'http\S+', '', text) # Removes urls
    return text
    