In [6]:
import pandas as pd
import numpy as np
import sys
import re
import json
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import RegexpTokenizer

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import DistanceMetric, KNeighborsRegressor
from sklearn.datasets import load_iris, make_classification

from scipy import sparse



### Load in Data

In [7]:
strain_data = pd.read_csv('Data/Cannabis_Data.csv')

In [8]:
url = "https://www.kaggle.com/nvisagan/cannabis-strains-features"

### Search Methods

In [9]:
def find_strain(name):
    return strain_data['Description'][strain_data['Strain'].str.contains(name)]

In [10]:
def find_descrip(word):
    return strain_data[strain_data['Description'].str.contains(word)]

In [11]:
strain_data['Flavor'].fillna('Unknown', inplace=True)

In [12]:
strain_data['Description'].fillna('None', inplace=True)

In [13]:
strain_data['Strain'] = strain_data['Strain'].str.replace('-', ' ')
strain_data['Strain'] = strain_data['Strain'].str.replace('Ar 4', 'Ar-4')

In [14]:
list_of_strains = list(strain_data['Strain'])

In [15]:
strain_breakdown = pd.DataFrame(strain_data['Strain'])
strain_breakdown['Rating'] = strain_data['Rating']

In [16]:
# Create Dummie Variables for Sativa, Indica, and Hybrid.

strain_breakdown['Type'] = strain_data['Type']

strain_breakdown = pd.get_dummies(strain_breakdown, columns=['Type'])

strain_breakdown.columns = ['Strain', 'Rating', 'Type: Hybrid', 'Type: Indica', 'Type: Sativa']

In [17]:
#loop to extract all the words used.

flavor_options = []

for i in range(len(strain_data)):
    flavors = strain_data['Flavor'].iloc[i,]
    flavors = flavors.split(',')
    for flav in flavors:
        if flav not in flavor_options:
            flavor_options.append(flav)
        else:
            pass

In [18]:
print (flavor_options, end=' ')
# # I'm about 50/50 on using these, because I don't know how important taste is to the average user, but here they are.

['Earthy', 'Sweet', 'Citrus', 'Flowery', 'Violet', 'Diesel', 'Spicy/Herbal', 'Sage', 'Woody', 'Apricot', 'Grapefruit', 'Orange', 'None', 'Pungent', 'Grape', 'Pine', 'Skunk', 'Berry', 'Pepper', 'Menthol', 'Blue', 'Cheese', 'Chemical', 'Mango', 'Lemon', 'Peach', 'Vanilla', 'Nutty', 'Chestnut', 'Tea', 'Tobacco', 'Tropical', 'Strawberry', 'Blueberry', 'Mint', 'Apple', 'Honey', 'Lavender', 'Lime', 'Unknown', 'Coffee', 'Ammonia', 'Minty', 'Tree', 'Fruit', 'Butter', 'Pineapple', 'Tar', 'Rose', 'Plum', 'Pear'] 

In [19]:
for flavors in flavor_options:
    strain_breakdown['Terpenes: ' + flavors] = (strain_data['Flavor'].str.contains(flavors)).astype(int)

In [20]:
parent_strains = [
    
    'Aceh', 'Hindu Kush','OG Kush', 'Sour Diesel', 'Granddaddy Purple', 'Northern Lights',
    'Durban Poison', 'Bubba Kush', 'Pre-98 Bubba Kush', 'Jack Herer', 'Blue Dream', 'Trainwreck',
    'Hawaiian', 'Amnesia', 'Super Silver Haze', 'OG Badazz', 'Ms. Universe', 'LSD', 'Banana OG',
    'White Widow', 'Nepali OG', 'Afgoo', 'Appalachia', 'Harlequin', 'Jack the Ripper', 'Pennywise',
    'Lilly', 'Headband', 'Snowdawg', 'Snow Lotus', 'Green Crack', 'Alien OG', 'Alien Kush', 'Alien Dawg',
    'Super Silver Haze', 'Chernobyl', 'Elephant', 'Apollo 13', 'Space Queen', "Jack’s Cleaner",
    'LA Confidential', 'Maui Wowie', 'Gorilla Glue', 'AK-47', 'Blue Cheese', 'Deep Chunk', 'G13', 
    "Rare Dankness #1", 'Skywalker', 'Skywalker OG', 'Master Kush', 'SFV OG', 'Gooberry', 'Hell’s Angel OG',
    'Captain Krypt', 'Sour Bubble', 'Shiva', 'Blue Moonshine', 'Ortega', 'Fire OG', 'Green Ribbon',
    'Triangle Kush', 'Georgia Pine', 'Shishkaberry', 'Great White Shark', 'Mazar I Sharif', 
    
    'Nepalese', 'Afghani', 'Thai', 'Skunk', 'The White', 'Blackberry', 'Cheese',
    
]


# These are for strains that will need the first letter to be capitalized, since the word appears in other ways.
title_parent_strains = [     
     
]

In [21]:
for parents in parent_strains:
    strain_breakdown['Parent: ' + parents] = (strain_data['Description'].str.contains(parents)).astype(int)

In [22]:
gsc = ['Girl Scout Cookie', 'GSC']
strain_breakdown['Parent: Girl Scout Cookies'] = (strain_data['Description'].apply(lambda x: any(word in x for word in gsc))).astype(int)

chemdawg91 = ['chemdawg ‘91', 'chemdawg 91']
strain_breakdown['Parent: Chemdawg 91'] = (strain_data['Description'].str.lower().apply(lambda x: any(word in x for word in chemdawg91))).astype(int)

chemdawg = ['chemdawg', 'chem dawg', 'chem dog']
strain_breakdown['Parent: Chemdawg'] = (strain_data['Description'].str.lower().apply(lambda x: any(word in x for word in chemdawg))).astype(int)

In [23]:
thc_cbd = pd.DataFrame(strain_data['Strain'])

def find_ratio(string, name):
    df = strain_data['Description'].str.extract(string)
    df['CBD'].fillna(1, inplace=True)
    df['THC'].fillna(0, inplace=True)
    thc_cbd[name] = df['THC'].astype(int) / df['CBD'].astype(int)

In [24]:
find_ratio(r'THC:CBD ratio of about (?P<THC>\d+):(?P<CBD>\d+)', 'thc_cbd_1')
find_ratio(r'THC:CBD ratio of (?P<THC>\d+):(?P<CBD>\d+)', 'thc_cbd_2')
find_ratio(r'THC/CBD ratio of about (?P<THC>\d+):(?P<CBD>\d+)', 'thc_cbd_3')
find_ratio(r'THC/CBD ratio of (?P<THC>\d+):(?P<CBD>\d+)', 'thc_cbd_4')
find_ratio(r'(?P<THC>\d+):(?P<CBD>\d+) THC:CBD', 'thc_cbd_5')
find_ratio(r'(?P<THC>\d+):(?P<CBD>\d+) THC/CBD', 'thc_cbd_6')

find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+) CBD/THC', 'cbd_thc_1')
find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+) CBD:THC', 'cbd_thc_2')
find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+) CBD to THC', 'cbd_thc_3')
find_ratio(r'CBD to THC ratio of (?P<CBD>\d+):(?P<THC>\d+)', 'cbd_thc_4')
find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+) ratio of CBD to THC', 'cbd_thc_5')
find_ratio(r'CBD:THC ratio of (?P<CBD>\d+):(?P<THC>\d+)', 'cbd_thc_6')
find_ratio(r'(?P<CBD>\d+)-to-(?P<THC>\d+) CBD:THC', 'cbd_thc_7')
find_ratio(r'(?P<CBD>\d+):(?P<THC>\d+) 59:1 (CBD:THC)', 'cbd_thc_8')


one_to_one = ['1:1 CBD/THC', '1:1 THC/CBD', '1:1 THC-CBD', '1:1 CBD-THC', '1:1 ratio of CBD:THC', '1:1 ratio of THC:CBD'
              '1:1 ratio of THC:CBD', 'BD:THC ratio can range anywhere from 1:1 – 2:1', 'balanced THC/CBD',
              '1:1 ratio of CBD to THC', '1:1 ratio of THC to CBD', '1:1 ratio of THC and CBD', '1:1 ratio of CBD and THC',
              'THC to CBD ratio typically comes out 1:1', '1:1 CBD-THC', 'balanced 1:1 cannabinoid', 'balanced THC:CBD ratio']
thc_cbd['Equal'] = (strain_data['Description'].apply(lambda x: any(word in x for word in one_to_one))).astype(int)

In [25]:
strain_frame = strain_breakdown.set_index('Strain')

In [26]:
strain_frame.to_csv('Data/Strain_Frame.csv')

In [27]:
np.set_printoptions(threshold=1000, suppress=None)

In [28]:
X = strain_frame

In [29]:
pivot = strain_frame.pivot_table

In [30]:
pivot_sparse = sparse.csr_matrix

In [31]:
rec_df = pd.DataFrame(X)
rec_df.head(20000)

Unnamed: 0_level_0,Rating,Type: Hybrid,Type: Indica,Type: Sativa,Terpenes: Earthy,Terpenes: Sweet,Terpenes: Citrus,Terpenes: Flowery,Terpenes: Violet,Terpenes: Diesel,...,Parent: Nepalese,Parent: Afghani,Parent: Thai,Parent: Skunk,Parent: The White,Parent: Blackberry,Parent: Cheese,Parent: Girl Scout Cookies,Parent: Chemdawg 91,Parent: Chemdawg
Strain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100 Og,4.0,1,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98 White Widow,4.7,1,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1024,4.4,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13 Dawgs,4.2,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
24K Gold,4.6,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zeus Og,4.7,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Zkittlez,4.6,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Zombie Kush,5.0,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Zombie Og,4.4,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [32]:
rec_df.to_csv('Data/Strain_Recommender.csv')

In [33]:
rec_df.shape

(2351, 130)

In [34]:
with open('Data/Strain_Dictionary.txt', 'w') as f:
    f.writelines('\n'.join(list_of_strains))

In [35]:
Strain_List = {K: 1 for K in list_of_strains}

with open('Data/Strain_List.json', 'w') as f:
    json.dump(Strain_List, f)

In [36]:
sc = ('Data/Strain_List.json')

In [37]:
def suggestion(strain):
    strain_title = strain.title()
    try:
        print (f"Strains similar to {strain.upper()} include ")
        result = recommender_df[strain_title].sort_values(ascending=False)[1:11]
        return result.astype(str).map(lambda x: x + '%')
    except:
        if sc != strain:
            return(strain_title)
        else:
            return( '''Strain Not Found. 
If you searched the full strain name, try just the initials.''')

In [38]:
suggestion('Og Kush')

Strains similar to OG KUSH include 


'Og Kush'