In [1]:
import pandas as pd

df = pd.read_csv('./cannabis_raw.csv')

In [2]:
print(df.shape)
df.head()

(2351, 6)


Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [3]:
df.isnull().any()

Strain         False
Type           False
Rating         False
Effects        False
Flavor          True
Description     True
dtype: bool

In [4]:
len(df['Strain'].unique())

2350

In [5]:
df.drop_duplicates(subset ='Strain', keep=False,inplace=True)
df = df[df['Description'].notnull()]
df = df[df['Flavor'].notnull()]
df = df[df['Rating']>4]
df.reset_index(inplace = True)
df.shape

(2010, 7)

In [6]:
## Save new dataframe to a pickle
df.to_pickle("dummy.pkl")

In [7]:
"""
Import Statements
"""

# Base
from collections import Counter
import re

# NLP Libraries
import spacy
from spacy.tokenizer import Tokenizer
from nltk.stem import PorterStemmer

nlp = spacy.load("en_core_web_lg")

In [8]:
import re
def tokenize(text):
    """Parses a string into a list of semantic units (words)

    Args:
        text (str): The string that the function will tokenize.

    Returns:
        list: tokens parsed out by the mechanics of your choice
    """
    
    tokens = re.sub(r'[^a-zA-Z ^0-9]', ',', text)
    tokens = tokens.lower().replace(',', ' ')
    tokens = tokens.split()
    
    return tokens

In [9]:
df['Effects_tokens'] = df['Effects'].apply(tokenize)
df['Effects_tokens'].head()

0     [relaxed, aroused, creative, happy, energetic]
1    [uplifted, happy, relaxed, energetic, creative]
2      [tingly, creative, hungry, relaxed, uplifted]
3    [happy, relaxed, euphoric, uplifted, talkative]
4       [relaxed, euphoric, happy, uplifted, hungry]
Name: Effects_tokens, dtype: object

In [10]:
df['Flavor_tokens'] = df['Flavor'].apply(tokenize)
df['Flavor_tokens'].head()

0        [flowery, violet, diesel]
1     [spicy, herbal, sage, woody]
2    [apricot, citrus, grapefruit]
3         [citrus, earthy, orange]
4         [earthy, sweet, pungent]
Name: Flavor_tokens, dtype: object

In [11]:
# Object from Base Python
from collections import Counter

# The object `Counter` takes an iterable, but you can instaniate an empty one and update it. 
word_counts = Counter()

def count(docs):

        word_counts = Counter()
        appears_in = Counter()
        
        total_docs = len(docs)

        for doc in docs:
            word_counts.update(doc)
            appears_in.update(set(doc))

        temp = zip(word_counts.keys(), word_counts.values())
        
        wc = pd.DataFrame(list(temp), columns = ['word', 'count'])

        wc['rank'] = wc['count'].rank(method='first', ascending=False)
        total = wc['count'].sum()

        wc['pct_total'] = wc['count'].apply(lambda x: x / total)
        
        wc = wc.sort_values(by='rank')
        wc['cul_pct_total'] = wc['pct_total'].cumsum()

        t2 = zip(appears_in.keys(), appears_in.values())
        ac = pd.DataFrame(t2, columns=['word', 'appears_in'])
        wc = ac.merge(wc, on='word')

        wc['appears_in_pct'] = wc['appears_in'].apply(lambda x: x / total_docs)
        
        return wc

In [12]:
wc_effects = count(df['Effects_tokens'])
print(wc_effects.shape)
wc_effects.sort_values(by='rank')

(14, 7)


Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
1,happy,1697,1697,1.0,0.171535,0.171535,0.844279
4,relaxed,1569,1569,2.0,0.158597,0.330132,0.780597
8,euphoric,1472,1472,3.0,0.148792,0.478924,0.732338
5,uplifted,1345,1345,4.0,0.135955,0.614879,0.669154
2,creative,652,652,5.0,0.065905,0.680784,0.324378
12,sleepy,644,644,6.0,0.065097,0.745881,0.320398
3,energetic,566,566,7.0,0.057212,0.803093,0.281592
11,focused,532,532,8.0,0.053775,0.856868,0.264677
7,hungry,405,405,9.0,0.040938,0.897807,0.201493
9,talkative,303,303,10.0,0.030628,0.928434,0.150746


In [13]:
wc_flavor = count(df['Flavor_tokens'])
print(wc_flavor.shape)
wc_flavor.sort_values(by='rank')

(51, 7)


Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
11,earthy,996,996,1.0,0.162586,0.162586,0.495522
13,sweet,969,969,2.0,0.158178,0.320764,0.48209
9,citrus,473,473,3.0,0.077212,0.397976,0.235323
12,pungent,410,410,4.0,0.066928,0.464904,0.20398
18,berry,336,336,5.0,0.054848,0.519752,0.167164
15,pine,276,276,6.0,0.045054,0.564806,0.137313
0,flowery,241,241,7.0,0.039341,0.604146,0.1199
3,woody,228,228,8.0,0.037218,0.641365,0.113433
2,diesel,220,220,9.0,0.035913,0.677277,0.109453
5,spicy,205,205,10.0,0.033464,0.710741,0.10199


In [14]:
print(len(wc_effects['word'].to_list()))
wc_effects = wc_effects.sort_values(by='rank')
wc_effects['word'].to_list()

14


['happy',
 'relaxed',
 'euphoric',
 'uplifted',
 'creative',
 'sleepy',
 'energetic',
 'focused',
 'hungry',
 'talkative',
 'tingly',
 'giggly',
 'aroused',
 'none']

In [15]:
wc_flavor = wc_flavor.sort_values(by='rank')
print(len(wc_flavor['word'].to_list()))
wc_flavor['word'].to_list()[:26]

51


['earthy',
 'sweet',
 'citrus',
 'pungent',
 'berry',
 'pine',
 'flowery',
 'woody',
 'diesel',
 'spicy',
 'herbal',
 'lemon',
 'skunk',
 'tropical',
 'blueberry',
 'grape',
 'orange',
 'cheese',
 'pepper',
 'lime',
 'strawberry',
 'grapefruit',
 'sage',
 'minty',
 'pineapple',
 'none']

### Flavors: 
Earthy, Sweet, Citrus, Pungent, Berry, Pine, Flowery, Woody,
Spicy, Herbal, Lemon, Lavender, Tropical, Blueberry, Grape,
Orange, Pepper, Lime, Strawberry, Grapefruit, Sage,
Minty, Pineapple, None, Vanilla, Apple.

### Effects: 
Happy, Relaxed, Euphoric, Uplifted, Creative, Sleepy, Energetic,
Focused, Hungry, Talkative, Tingly, Giggly, Aroused, None.

### Ailments:
Depression, Inflammation, Insomnia, Lack of Appetite, Muscle Spasms,
Nausea, Pain, Seizures, Stress, Anxiety, Headaches, Fatigue.

In [16]:
df['Description'] = df['Description'].str.replace(u'\xa0', u' ')
df['Description'].head()

0    The ‘98 Aloha White Widow is an especially pot...
1    1024 is a sativa-dominant hybrid bred in Spain...
2    13 Dawgs is a hybrid of G13 and Chemdawg genet...
3    Also known as Kosher Tangie, 24k Gold is a 60%...
4    The 3 Kings marijuana strain, a holy trinity o...
Name: Description, dtype: object

In [17]:
# Tokenizer Pipe
from spacy.tokenizer import Tokenizer

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

tokens = []
STOP_WORDS = nlp.Defaults.stop_words.union([' '])

""" Make them tokens """
for doc in tokenizer.pipe(df['Description'], batch_size=500):
    doc_tokens = []   
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower()) 
    tokens.append(doc_tokens)

df['Description_tokens'] = tokens

In [18]:
#wc_description = count(df['Description_tokens'])
#print(wc_description.shape)
#wc_description.head()

In [19]:
df['General_Description'] = df['Effects']+""+df['Flavor']+""+df['Description']

In [20]:
df['General_Description'][0]

'Relaxed,Aroused,Creative,Happy,EnergeticFlowery,Violet,DieselThe ‘98 Aloha White Widow is an especially potent cut of White Widow that has grown in renown alongside Hawaiian legends like Maui Wowie and Kona Gold. This White Widow phenotype reeks of diesel and skunk and has a rich earthy taste with intermittent notes of hash. Its buds are coated in trichomes, giving its dark foliage a lustrous glint to go along with its room-filling odor. This one-hitter-quitter uplifts the mind with mind-bending euphoria that materializes in the body as airy relaxation. ‘98 Aloha White Widow is available from Pua Mana 1st Hawaiian Pakalōlō Seed Bank.  '

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Create a vocabulary and get word counts per document
# Similiar to fit_predict
dtm = tfidf.fit_transform(df['General_Description'])

# Save tfidf to a pickle
pickle.dump(tfidf, open("tfidf.pkl", "wb"))

# Get feature names to use as dataframe column headers
general_dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
general_dtm.head()

Unnamed: 0,09,10,100,11,12,13,14,15,16,17,...,zen,zest,zestful,zesty,zeta,zion,zkittlez,zombie,zone,zoning
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.307004,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
from sklearn.neighbors import NearestNeighbors
# Fit on dtm

nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(general_dtm)

# Save the trained model as a pickle string. 
saved_model = pickle.dumps(nn) 
  
# Load the pickled model 
nn_model = pickle.loads(saved_model) 
  
from sklearn.externals import joblib 
  
# Save the model as a pickle in a file 
joblib.dump(nn_model, 'nn_model.pkl') 
  
# Load the model from the file 
nn_model = joblib.load('nn_model.pkl')  



In [23]:
my_desc = [""" 
none,
pineapple,  peach, 
Depression
"""]

new = tfidf.transform(my_desc)
new

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [24]:
_, similar_topic_indices = nn.kneighbors(new.todense())

### Provides 5 strains based on the input (Example)

In [25]:
#columns = ['Strain', 'Type', 'Rating', 'Effects', 'Flavor', 'Description']

strains = [df['Strain'][t] for t in similar_topic_indices]
print('* - * - Strains - * - *')
print(strains, sep='\n\n')
print(end="\n\n")

types = [df['Type'][t] for t in similar_topic_indices]
print('* - * - Types - * - *')
print(types, sep='\n\n')
print(end="\n\n")

ratings = [df['Rating'][t] for t in similar_topic_indices]
print('* - * - Ratings - * - *')
print(ratings, sep='\n\n')
print(end="\n\n")

similar_flavors = [df['Flavor'][t] for t in similar_topic_indices]
print('* - * - Flavors - * - *')
print(similar_flavors, sep='\n\n')
print(end="\n\n")

similar_effects = [df['Effects'][t] for t in similar_topic_indices]
print('* - * - Effects - * - *')
print(similar_effects, sep='\n\n')
print(end="\n\n")

similar_topics = [df['Description'][t] for t in similar_topic_indices]
print('* - * - Descriptions - * - *')
print(similar_topics, sep='\n\n')

* - * - Strains - * - *
[768                Golden-Pineapple
1369                   Pineapple-Og
1267            Northwest-Pineapple
1375    Pineapple-Super-Silver-Haze
1371         Pineapple-Purple-Skunk
Name: Strain, dtype: object]


* - * - Types - * - *
[768     hybrid
1369    sativa
1267    hybrid
1375    sativa
1371    hybrid
Name: Type, dtype: object]


* - * - Ratings - * - *
[768     4.5
1369    4.6
1267    5.0
1375    4.7
1371    4.5
Name: Rating, dtype: float64]


* - * - Flavors - * - *
[768     Pineapple,Tropical,Citrus
1369    Pineapple,Citrus,Tropical
1267              Sweet,Pineapple
1375       Pineapple,Sweet,Earthy
1371       Sweet,Citrus,Pineapple
Name: Flavor, dtype: object]


* - * - Effects - * - *
[768      Happy,Euphoric,Uplifted,Relaxed,Creative
1369    Happy,Uplifted,Focused,Creative,Energetic
1267              Uplifted,Euphoric,Happy,Relaxed
1375      Happy,Euphoric,Energetic,Focused,Tingly
1371      Relaxed,Happy,Uplifted,Euphoric,Aroused
Name: Effects, dtyp

In [26]:
#df.to_csv('cannabis.csv')

In [28]:
## Save dataframe to a pickle
#df.to_pickle("dummy_with_tokens.pkl")

### Code to add spaces in between words in 'Effects', 'Flavor' columns

In [52]:
#def tokenize(text):
#    tokens = re.sub(r'[^a-zA-Z ^0-9]', ',', text)
#    tokens = tokens.replace(',', ' ')
#    tokens = tokens.split()
#    string = ', '.join(tokens)
#    return string

#df['Effects'] = df['Effects'].apply(tokenize)
#df['Flavor'] = df['Flavor'].apply(tokenize)

In [53]:
#df.to_csv('Cannabis_2.csv')

In [54]:
#with open('Strains.txt', 'w') as f:
#    strains = df['Strain'].to_json()
#    f.writelines(strains)