In [None]:
import pandas as pd
import numpy as np
import altair as alt

In [None]:
google_data = pd.read_csv("google-patent-data.csv")
google_data.head()

In [None]:
scraped_data = pd.read_csv("scraped_patents.csv")
scraped_data.head()

In [None]:
google_data.shape

In [None]:
scraped_data.shape

In [None]:
patents = pd.merge(google_data, scraped_data, left_on='result link', right_on='url')
patents.head()

In [None]:
patents_cleaned = patents.drop(columns=['id','result link', 'representative figure link', 'flesh.brix_text'])
patents_cleaned.head()

In [None]:
patents_cleaned.dtypes

In [None]:
patents_cleaned['grant date'] = pd.to_datetime(patents_cleaned['grant date'])

patents_cleaned['grant year'] = patents_cleaned['grant date'].dt.year

patents_cleaned.head()

In [None]:
year_column = patents_cleaned.pop('grant year')

In [None]:
patents_cleaned.insert(1, 'year', year_column)

In [None]:
patents_cleaned.head()

In [None]:
patents_cleaned['pluot name'] = patents_cleaned['title'].str.extract(r'[\'"`“”‘’](.*?)[\'"`“”‘’]', expand=False)

In [None]:
patents_cleaned.head()

In [None]:
patents_cleaned['pluot name'] = patents_cleaned['pluot name'].str.title()

In [None]:
patents_cleaned['pluot name'].to_string(index=False)

In [None]:
name_column = patents_cleaned.pop('pluot name')

In [None]:
patents_cleaned.insert(1, 'pluot name', name_column)

In [None]:
patents_cleaned.head()

In [None]:
patents_cleaned.to_csv('patents_cleaned.csv', index=False)

In [None]:
handpicked = pd.read_csv("handpicked-data.csv")
handpicked.head()

In [None]:
pluots_all = pd.merge(patents_cleaned, handpicked, left_on='pluot name', right_on='Varietal', how='outer')
pluots_all.head()

In [None]:
pluots_all.shape

In [None]:
pluots_all['pluot name'] = pluots_all['pluot name'].combine_first(pluots_all['Varietal'])

In [None]:
pluots_all.drop(columns='Varietal')

In [None]:
pluots_all.head()

In [None]:
pluots_all.to_csv("pluots_all.csv", index=False)

In [None]:
pluots_cleaned = pd.read_csv("pluots_cleaned.csv")
pluots_cleaned.head()

In [None]:
pluots_cleaned.dtypes

In [None]:
pluots_cleaned['year'] = pluots_cleaned['year'].astype('Int64')

In [None]:
pluots_cleaned['year'].value_counts().sort_index().plot(kind='bar')

In [None]:
pluots_year_brix = pluots_cleaned[['pluot name', 'year', 'scraped_brix']]
pluots_year_brix.head()

In [None]:
pluots_year_brix.plot.scatter(x='year', y='scraped_brix')

In [None]:
pluots_taste = pd.read_csv("pluot_taste_scores.csv")
pluots_taste.head()

In [None]:
pluots_taste.rename(columns={'name': 'pluot name'}, inplace=True)

In [None]:
pluots_taste.plot(kind='scatter', x='acid', y='flavor')

In [None]:
pluots_taste.plot(kind='scatter', x='sugar', y='flavor')

In [None]:
pluots_taste.plot(kind='scatter', x='flavor', y='overall')

In [None]:
pluots_taste_means = pluots_taste.groupby('pluot name')[['acid', 'sugar', 'flavor', 'overall']].mean()
pluots_taste_means.head()

In [None]:
pluots_taste_merged = pd.merge(pluots_taste_means, pluots_cleaned[['pluot name', 'scraped_brix']], how='left', on='pluot name')
pluots_taste_merged.head(10)

In [None]:
pluots_taste_merged.plot(kind='scatter', x='scraped_brix', y='overall')

In [None]:
pluots_taste_merged.plot(kind='scatter', x='sugar', y='flavor')

# To look at pluot colors

In [None]:
import ast

# Hex colors come from the 'skin.hex' and 'flesh.hex' columns of pluots_cleaned.
# Each cell is expected to hold a stringified list such as "['#aabbcc', None]".
def extract_hexes(val):
    """Parse one cell of hex colors and return its non-None entries as a list.

    Parameters
    ----------
    val : object
        Usually a string containing a Python list literal. Anything that
        ``ast.literal_eval`` cannot parse (NaN, empty string, free text) is
        treated as "no colors".

    Returns
    -------
    list
        The parsed entries with ``None`` removed. A single quoted string is
        returned as a one-element list; any other non-sequence literal
        (number, dict, ...) yields ``[]``.
    """
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError):
        # Missing or unparsable cells contribute no colors.
        return []
    if isinstance(parsed, str):
        # A bare string would otherwise be iterated character by character.
        return [parsed]
    if not isinstance(parsed, (list, tuple, set)):
        # Numbers and other scalars are not iterable; previously a TypeError.
        return []
    return [color for color in parsed if color is not None]

# Flatten the entire column into one list of hex values
all_skin_hexes = pluots_cleaned['skin.hex'].apply(extract_hexes).sum()
all_flesh_hexes = pluots_cleaned['flesh.hex'].apply(extract_hexes).sum()

In [None]:
all_skin_hexes

In [None]:
all_flesh_hexes

In [None]:
import seaborn as sns

# Apply seaborn's default theme to figures drawn from here on.
sns.set_theme()

def hex_to_rgb(hex_value):
  """Convert a hex color string to an (r, g, b) tuple of floats in [0, 1].

  Accepts 'rrggbb' with or without leading '#', plus the 3-digit shorthand
  'rgb', which is expanded by doubling each digit ('f0a' -> 'ff00aa').
  Only the first six hex digits are read. int() raises ValueError when a
  channel is not valid hexadecimal.
  """
  h = hex_value.lstrip('#')
  if len(h) == 3:
    # Shorthand used to reach int('', 16) for the blue channel and raise.
    h = ''.join(digit * 2 for digit in h)
  return tuple(int(h[i:i + 2], 16) / 255.0 for i in (0, 2, 4))

all_skin_rgb = list(map(hex_to_rgb, all_skin_hexes))

row_size = 37
rows = [all_skin_rgb[i:i + row_size] for i in range(0, len(all_skin_rgb), row_size)]

for row in rows:
  sns.palplot(row)

In [None]:
all_flesh_rgb = list(map(hex_to_rgb, all_flesh_hexes))

row_size = 40
rows = [all_flesh_rgb[i:i + row_size] for i in range(0, len(all_flesh_rgb), row_size)]

for row in rows:
  sns.palplot(row)