# Getting the data

In [None]:
import json
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from scipy import stats

In [None]:
# Rakuten developer application IDs used for this course project.
# NOTE(review): credentials should not live in a shared notebook. Each ID can be
# overridden through an environment variable; the literals remain only as
# fallbacks so that existing runs behave exactly as before.
APP_ID = os.environ.get('RAKUTEN_APP_ID', '1088230580303060389') ## This is the developer ID we use for this course project
APP_ID_shinobu = os.environ.get('RAKUTEN_APP_ID_SHINOBU', '1071135727571229389') ## Another developer ID for this course. Note that different ID gives different ranking results

In [None]:
_RANKING_ENDPOINT = 'https://app.rakuten.co.jp/services/api/IchibaItem/Ranking/20220601'


def simple_get_endpoint(age):
  """Return the ranking URL filtered by age group only.

  `age` is interpolated into the query string as-is. This is the basic
  shape of an endpoint builder; further attributes can be stacked onto
  the query in the same way.
  """
  return _RANKING_ENDPOINT + f'?age={age}'


def get_endpoint(age, sex):
  """Return the ranking URL filtered by age group and sex.

  Per the original author's note: sex=0 means male, sex=1 means female,
  and age must be an integer divisible by 10. Neither value is validated
  here.
  """
  return '{}?age={}&sex={}'.format(_RANKING_ENDPOINT, age, sex)

## Inspecting data frame

In [None]:
# Query parameters for requests made with the second developer ID.
params_shinobu = {
        'applicationId': APP_ID_shinobu,
        'format': 'json',
    }

# Fail fast on a hung connection or an HTTP error instead of running into a
# confusing KeyError on 'Items' further down.
shinobu_response = requests.get(simple_get_endpoint(20), params=params_shinobu, timeout=30)
shinobu_response.raise_for_status()
shinobu_data1 = shinobu_response.json()
print(shinobu_data1.keys())
## Showing top 3 from items ranking for 20s customers. Item information will change as time passes due to dynamic ranking
# Slicing (rather than indexing 0..2) keeps the cell working when fewer than 3 items come back.
for entry in shinobu_data1['Items'][:3]:
    display(entry['Item'])

## Overall item ranking for women@20s

In [None]:
# Query parameters for requests made with the main developer ID.
# NOTE(review): 'keyword': '-reviewCount' looks like a sort value rather than a
# ranking filter — confirm that the Ranking endpoint actually honours it.
params = {
        'applicationId': APP_ID,
        'format': 'json',
        'keyword': '-reviewCount'
    }

# Ranking for age group 20, sex=1 (female). Fail fast on a hung connection or
# an HTTP error instead of parsing an error payload as if it were ranking data.
jenny_response = requests.get(get_endpoint(20, 1), params=params, timeout=30)
jenny_response.raise_for_status()
jenny_data = jenny_response.json()

In [None]:
jenny_data.keys()

In [None]:
jenny_data['title']

In [None]:
len(jenny_data['Items'])

In [None]:
jenny_data['Items'][0]

### It turns out that if you specify the gender and age you won't get the reviews

In [None]:
# Query parameters for the unfiltered (no age / sex) ranking requests below;
# same values as the earlier `params` definition.
params = dict(
    applicationId=APP_ID,
    format='json',
    keyword='-reviewCount',
)

Getting data for 34 pages

In [None]:
# Download ranking pages 1-34 and collect every item's attribute dict in `data`.
DROPPED_KEYS = ('mediumImageUrls', 'smallImageUrls')  # image URL lists are dropped from each item

data = []
for page in range(1, 35):
  response = requests.get(f'https://app.rakuten.co.jp/services/api/IchibaItem/Ranking/20220601?page={page}', params=params, timeout=30)
  response.raise_for_status()  # stop on a failed page instead of a KeyError on 'Items'
  res = response.json()  # `res` keeps the last page's payload; a later cell inspects it
  for item in res['Items']:
    for key in DROPPED_KEYS:
      item['Item'].pop(key, None)  # no-op when the key is absent
    data.append(item['Item'])
# Preview a few records instead of dumping the whole list into the cell output.
data[:3]

In [None]:
len(data)

In [None]:
res['Items'][0]['Item'].keys()

In [None]:
data_keys = list(data[0].keys())

In [None]:
# Pivot the list of item dicts into {attribute: [one value per item]}.
# Items that lack an attribute contribute an empty string for it.
new_dict = {
  attribute: [item.get(attribute, '') for item in data]
  for attribute in data_keys
}

In [None]:
df = pd.DataFrame(new_dict)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Columns that are cast to float: the item price, its three Max / Min variants, and the review average.
float_col = (
    ['itemPrice']
    + [f'itemPrice{bound}{level}' for bound in ('Max', 'Min') for level in (1, 2, 3)]
    + ['reviewAverage']
)

In [None]:
# turn string number into actual float
# Attributes missing from an item were filled with '' when the frame was built,
# and astype(float) raises on ''. to_numeric(errors='coerce') turns such
# placeholders into NaN; the trailing astype(float) keeps the columns float
# even when every value happens to be an integer.
df[float_col] = df[float_col].apply(pd.to_numeric, errors='coerce').astype(float)

In [None]:
df.isnull().sum()

In [None]:
# drop columns with uniform values or is completely empty
# errors='ignore' makes the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
df = df.drop(
    columns=['carrier', 'shipOverseasFlag', 'taxFlag', 'affiliateUrl', 'imageFlag', 'asurakuArea', 'asurakuClosingTime', 'asurakuFlag', 'shopAffiliateUrl', 'creditCardFlag'],
    errors='ignore',
)

In [None]:
df.describe()

In [None]:
# Price / review columns plus the hasPriceRange flag, for items that show a price range
df.loc[df['hasPriceRange'] == 1, float_col + ['hasPriceRange']]

In [None]:
# Rows where itemPriceMin2 differs from itemPriceMin3 (first five shown).
# NOTE(review): the earlier comment here spoke of priceMax1 vs priceMax2, but the
# filter compares the Min2 / Min3 columns — confirm which pair was intended.
df[float_col].join(df['hasPriceRange'])[(df['itemPriceMin2'] != df['itemPriceMin3'])].head()

In [None]:
df['itemPriceBaseField'].unique()

In [None]:
# Inspect the full record at index label 37.
# NOTE(review): the label is hard-coded; the ranking is dynamic, so a fresh
# download will likely put a different item on this row.
df.loc[37]

In [None]:
df[float_col].join(df['hasPriceRange'])[(df['itemPriceMin2'] < df['itemPriceMin3'])]

In [None]:
# Price / review columns plus the hasPriceRange flag, for items that do not display a price range
df.loc[df['hasPriceRange'] == 0, float_col + ['hasPriceRange']]

In [None]:
# Inspect the full record at index label 990.
# NOTE(review): the label is hard-coded; the ranking is dynamic, so a fresh
# download will likely put a different item on this row.
df.loc[990]

### Does review availability correlate with displaying Price Range?

In [None]:
df[(df['reviewAverage'] == 0)].shape

In [None]:
df[(df['reviewAverage'] == 0) & (df['hasPriceRange'] == 1)].shape
# if it has price range, customers tend to leave reviews (lower review average = 0 count)

In [None]:
# Keep only the items that have at least one review (reviewCount above zero)
reviewed_mask = df['reviewCount'].gt(0)
has_review = df.loc[reviewed_mask]
has_review.shape

In [None]:
# Binary flag column: 0 when reviewCount equals 0, otherwise 1
df['has_review'] = (df['reviewCount'] != 0).astype(int)

In [None]:
contingency = pd.crosstab(df['has_review'], df['hasPriceRange'])
contingency

### Whether displaying Price Range correlates with the reviewAverage

In [None]:
reviewAverage_on_priceRange = has_review.groupby('hasPriceRange')['reviewAverage'].apply(list)

In [None]:
t_test, p_value = stats.ttest_ind(*reviewAverage_on_priceRange)
p_value

wow, that was unexpected

In [None]:
# Down-sample the review averages of the no-price-range group (label 0) to at most 298 values.
# NOTE(review): 298 presumably matches the size of the price-range group printed
# in the next cell — confirm, or derive it from len(reviewAverage_on_priceRange[1]).
SAMPLE_SEED = 42  # fixed seed so that Restart & Run All reproduces the same subsample
no_range_values = reviewAverage_on_priceRange[0]
sample_size = min(298, len(no_range_values))  # random.sample raises ValueError when k exceeds the population
reviewAverage_on_priceRange[0] = random.Random(SAMPLE_SEED).sample(no_range_values, sample_size)

In [None]:
# no. of reviewed item with price range vs. w/o price range
len(reviewAverage_on_priceRange[1]), len(reviewAverage_on_priceRange[0])

In [None]:
hasPriceRange_reviewMean = sum(reviewAverage_on_priceRange[1])/len(reviewAverage_on_priceRange[1])
hasPriceRange_reviewMean

In [None]:
# Sample standard deviation (ddof=1) of the review averages for items WITH a
# price range (hasPriceRange group 1).
hasPriceRange_reviewSTD = np.std(reviewAverage_on_priceRange[1], ddof=1)
hasPriceRange_reviewSTD

In [None]:
plt.title('Distribution of Review Average of Item w/ Price Range')
plt.hist(reviewAverage_on_priceRange[1])

In [None]:
noPriceRange_reviewMean = sum(reviewAverage_on_priceRange[0])/len(reviewAverage_on_priceRange[0])
noPriceRange_reviewMean

In [None]:
plt.title('Distribution of Review Average of Item w/o Price Range')
plt.hist(reviewAverage_on_priceRange[0])

In [None]:
# Sample standard deviation (ddof=1) of the review averages for items WITHOUT a
# price range (hasPriceRange group 0).
noPriceRange_reviewSTD = np.std(reviewAverage_on_priceRange[0], ddof=1)
noPriceRange_reviewSTD

In [None]:
# difference
noPriceRange_reviewMean - hasPriceRange_reviewMean

- We can reject the null hypothesis that displaying a price range does not correlate with the review average.
- Items without a price range tend to have a higher review average than items with one.

### How does review count correlate with review average?

In [None]:
plt.scatter(has_review['reviewCount'], has_review['reviewAverage'])

In [None]:
has_review[['hasPriceRange', 'reviewCount']][has_review['reviewAverage'] < 3.5]

In [None]:
plt.scatter(has_review['reviewCount'], has_review['reviewAverage'], c=has_review['hasPriceRange'])

# Point System

In [None]:
sorted(df['pointRate'].unique())

In [None]:
plt.hist(df['pointRate'])

# Availability and Review Count

In [None]:
plt.hist(has_review[has_review['availability'] == 0]['reviewCount'])

In [None]:
df.info()

In [None]:
num_cols = ['availability', 'hasPriceRange'] + float_col + ['pointRate', 'postageFlag', 'rank', 'reviewCount', 'shopOfTheYearFlag']

In [None]:
corr = df[num_cols].corr()

In [None]:
import seaborn as sns

In [None]:
plt.figure(figsize=(20,10))
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0)

In [None]:
df.groupby('genreId').count()

# Contact Lens

In [None]:
df[df['genreId'] == '563999']

In [None]:
# Subset for this section: rows of genre 563999 followed by rows of genre 408099
LENS_GENRE_IDS = ['563999', '408099']
lens_df = pd.concat([df[df['genreId'] == genre_id] for genre_id in LENS_GENRE_IDS])
lens_df.shape

In [None]:
df['genreId'].value_counts()

In [None]:
lens_df.info()

In [None]:
plt.figure(figsize=(20,10))
sns.heatmap(lens_df[num_cols].corr(), annot=True, cmap='coolwarm', center=0)

### Review Count & Review Average

In [None]:
plt.scatter(lens_df['reviewCount'], lens_df['reviewAverage'])

### Postage Flag & Review Average

- We hypothesized that the postage flag would be negatively correlated with the review average.

**PostageFlag**
- 0: Postage included (送料無料, free shipping)
- 1: Postage not included (buyer pays shipping)

So, we want to check whether free shipping correlates with higher review averages

$H_0$: there is no difference in review average between items with and without free shipping.

$H_1$: items with free shipping have higher review averages.

In [None]:
# Example URL of one free-shipping item (postageFlag == 0).
# The previous lookup used index label 1 on the filtered frame, which raises
# KeyError whenever row 1 is not a free-shipping item (the ranking changes over
# time). Take the first matching row by position instead.
df.loc[df['postageFlag'] == 0, 'itemUrl'].iloc[0]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.boxplot(x='postageFlag', y='reviewAverage', data=lens_df)
plt.title('Review Average by Postage Flag')
plt.xlabel('Postage Flag (0 = Free Shipping)')
plt.ylabel('Review Average')
plt.show()

In [None]:
lens_df['postageFlag'].value_counts()

In [None]:
from scipy.stats import ttest_ind

# Review averages of the lens_df items, split by shipping policy.
# NOTE(review): lens_df is filtered from df rather than has_review, so items whose
# reviewAverage is 0 may be included — confirm that is intended.
free = lens_df[lens_df['postageFlag'] == 0]['reviewAverage']
not_free = lens_df[lens_df['postageFlag'] == 1]['reviewAverage']

# H1 is directional (free shipping -> higher review average), so run a one-sided
# test. The default two-sided test would also "reject" when free shipping scores
# LOWER, which would contradict the conclusion printed below.
t_stat, p_value = ttest_ind(free, not_free, alternative='greater')

print(f't-statistic: {t_stat:.4f}')
print(f'p-value: {p_value:.4f}')

# Interpret the results
alpha = 0.05  # Significance level
if p_value < alpha:
    print("Reject the null hypothesis.")
    print("There is an evidence to suggest that free shipping tend to have higher review averages.")
else:
    print("Fail to reject the null hypothesis.")
    print("There is not enough evidence to suggest that free shipping tend to have higher review averages.")

#### Repeating the test on the `has_review` DataFrame

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.boxplot(x='postageFlag', y='reviewAverage', data=has_review)
plt.title('Review Average by Postage Flag')
plt.xlabel('Postage Flag (0 = Free Shipping)')
plt.ylabel('Review Average')
plt.show()

In [None]:
has_review['postageFlag'].value_counts()


In [None]:
from scipy.stats import ttest_ind

# Review averages of all reviewed items, split by shipping policy.
free = has_review[has_review['postageFlag'] == 0]['reviewAverage']
not_free = has_review[has_review['postageFlag'] == 1]['reviewAverage']

# H1 is directional (free shipping -> higher review average), so run a one-sided
# test. The default two-sided test would also "reject" when free shipping scores
# LOWER, which would contradict the conclusion printed below.
t_stat, p_value = ttest_ind(free, not_free, alternative='greater')

print(f't-statistic: {t_stat:.4f}')
print(f'p-value: {p_value:.4f}')

# Interpret the results
alpha = 0.05  # Significance level
if p_value < alpha:
    print("Reject the null hypothesis.")
    print("There is an evidence to suggest that free shipping tend to have higher review averages.")
else:
    print("Fail to reject the null hypothesis.")
    print("There is not enough evidence to suggest that free shipping tend to have higher review averages.")

### shopOfTheYearFlag & postageFlag
- Hypothesis: shopOfTheYearFlag is positively correlated with postageFlag
- H1: Winning shops are more likely to offer free shipping.
- H2: Winning shops are more likely to charge for shipping.

- H0: There is no correlation between winning shops and shipping charge.

**shopOfTheYearFlag**
- 0: Shops that have not won Shop of the Year
- 1: Shops that have won Shop of the Year

In [None]:
contingency_table = pd.crosstab(lens_df['shopOfTheYearFlag'], lens_df['postageFlag'])
print(contingency_table)

In [None]:
# Contingency table as percentages of each column total
# (every postageFlag column sums to 100)
column_totals = contingency_table.sum()
contingency_table.div(column_totals).mul(100)

In [None]:
from scipy.stats import chi2_contingency
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f'Chi-squared statistic: {chi2:.4f}')
print(f'p-value: {p}')
print(f'Degrees of freedom: {dof}')

In [None]:
# Interpret the results
alpha = 0.05  # Significance level
if p < alpha:
    print("Reject the null hypothesis.")
    print("There is a relationship between winning shops and shipping charge.")
else:
    # Failing to reject H0 does not show that no relationship exists, so the
    # message only reports insufficient evidence (same wording as the t-test cells).
    print("Fail to reject the null hypothesis.")
    print("There is not enough evidence to suggest a relationship between shop of the year flag and postage flag. (winning shops and shipping charge)")

Observing the contingency table:
Among Shop of the Year winners, there is little difference between the two postage-flag values. Ordinary shops (those that have not won the award), however, appear more likely to offer free shipping (this still needs to be double-checked). So, Shop of the Year winners are comparatively more likely to charge for shipping than non-winners.

In [None]:
#Let's change the dataset from contactlens to the bigger dataset
df.shape

In [None]:
lens_df.shape

In [None]:
contingency_table = pd.crosstab(df['shopOfTheYearFlag'], df['postageFlag'])
print(contingency_table)

In [None]:
from scipy.stats import chi2_contingency
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f'Chi-squared statistic: {chi2:.4f}')
print(f'p-value: {p}')
print(f'Degrees of freedom: {dof}')

In [None]:
# Interpret the results
alpha = 0.05  # Significance level
if p < alpha:
    print("Reject the null hypothesis.")
    print("There is a relationship between winning shops and shipping charge.")
else:
    # Failing to reject H0 does not show that no relationship exists, so the
    # message only reports insufficient evidence (same wording as the t-test cells).
    print("Fail to reject the null hypothesis.")
    print("There is not enough evidence to suggest a relationship between shop of the year flag and postage flag. (winning shops and shipping charge)")

### 驚いたことに (surprisingly), we fail to reject the null hypothesis when we have more data... I don't know... help me, zach ;-;

Then what can we conclude about this hypothesis???

### pointRate & reviewAverage
- Hypothesis: point rate is positively correlated with review average
- Null Hypothesis: there is no correlation between point rate and review average.

In [None]:
#contact lens dataset
sns.scatterplot(x='pointRate', y='reviewAverage', data=lens_df)
plt.title('Review Average by Point Rate')
plt.xlabel('Point Rate')
plt.ylabel('Review Average')
plt.show()

In [None]:
from scipy.stats import pearsonr

r, p = pearsonr(lens_df['pointRate'], lens_df['reviewAverage'])
print(f"Correlation coefficient: r = {r:.3f}, p-value = {p:.3f}")

if p < 0.05:
    print("There is a statistically significant correlation between point rate and review average.")
else:
    print("There is no statistically significant correlation between point rate and review average.")

In [None]:
#df dataset
sns.scatterplot(x='pointRate', y='reviewAverage', data=df)
plt.title('Review Average by Point Rate')
plt.xlabel('Point Rate')
plt.ylabel('Review Average')
plt.show()

In [None]:
from scipy.stats import pearsonr

r, p = pearsonr(df['pointRate'], df['reviewAverage'])
print(f"Correlation coefficient: r = {r:.3f}, p-value = {p:.3f}")

if p < 0.05:
    print("There is a statistically significant correlation between point rate and review average.")
else:
    print("There is no statistically significant correlation between point rate and review average.")

In this case, we can reject the null hypothesis when using the df dataset. When we use the contact lens dataset, we fail to reject the null hypothesis.

Lastly, we need to think about why we fail to reject the null hypothesis when analyzing df for shopOfTheYearFlag & postageFlag.

But why can we reject it when the dataset is small? Or the reason might be related to Japanese culture: regardless of the award, a customer needs to pay for shipping??? hahaa

### Note
If we have time, should we create a new variable like 'image-num'? Then we could observe whether the number of images has an effect on the number of reviews, ranking, and more.

-> Alright, we should not do that because the data doesn't give much image information; there are only small and medium image URLs.