In [28]:
import pandas as pd

df = pd.read_csv('./cannabis_raw.csv')

In [29]:
print(df.shape)
df.head()

(2351, 6)


Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [30]:
df.isnull().any()

Strain         False
Type           False
Rating         False
Effects        False
Flavor          True
Description     True
dtype: bool

In [31]:
df = df[df['Description'].notnull()]
df = df[df['Flavor'].notnull()]
df = df[df['Rating']>4]
df.reset_index(inplace = True) 

In [32]:
df.shape

(2012, 7)

In [33]:
"""
Import Statements
"""

# Base
from collections import Counter
import re

# NLP Libraries
import spacy
from spacy.tokenizer import Tokenizer
from nltk.stem import PorterStemmer

nlp = spacy.load("en_core_web_lg")

In [34]:
import re
def tokenize(text):
    """Parses a string into a list of semantic units (words)

    Args:
        text (str): The string that the function will tokenize.

    Returns:
        list: tokens parsed out by the mechanics of your choice
    """
    
    tokens = re.sub(r'[^a-zA-Z ^0-9]', ',', text)
    tokens = tokens.lower().replace(',', ' ')
    tokens = tokens.split()
    
    return tokens

In [35]:
df['Effects_tokens'] = df['Effects'].apply(tokenize)
df['Effects_tokens'].head()

0     [relaxed, aroused, creative, happy, energetic]
1    [uplifted, happy, relaxed, energetic, creative]
2      [tingly, creative, hungry, relaxed, uplifted]
3    [happy, relaxed, euphoric, uplifted, talkative]
4       [relaxed, euphoric, happy, uplifted, hungry]
Name: Effects_tokens, dtype: object

In [36]:
df['Flavor_tokens'] = df['Flavor'].apply(tokenize)
df['Flavor_tokens'].head()

0        [flowery, violet, diesel]
1     [spicy, herbal, sage, woody]
2    [apricot, citrus, grapefruit]
3         [citrus, earthy, orange]
4         [earthy, sweet, pungent]
Name: Flavor_tokens, dtype: object

In [37]:
# Object from Base Python
from collections import Counter

# The object `Counter` takes an iterable, but you can instaniate an empty one and update it. 
word_counts = Counter()

def count(docs):

        word_counts = Counter()
        appears_in = Counter()
        
        total_docs = len(docs)

        for doc in docs:
            word_counts.update(doc)
            appears_in.update(set(doc))

        temp = zip(word_counts.keys(), word_counts.values())
        
        wc = pd.DataFrame(list(temp), columns = ['word', 'count'])

        wc['rank'] = wc['count'].rank(method='first', ascending=False)
        total = wc['count'].sum()

        wc['pct_total'] = wc['count'].apply(lambda x: x / total)
        
        wc = wc.sort_values(by='rank')
        wc['cul_pct_total'] = wc['pct_total'].cumsum()

        t2 = zip(appears_in.keys(), appears_in.values())
        ac = pd.DataFrame(t2, columns=['word', 'appears_in'])
        wc = ac.merge(wc, on='word')

        wc['appears_in_pct'] = wc['appears_in'].apply(lambda x: x / total_docs)
        
        return wc

In [38]:
wc_effects = count(df['Effects_tokens'])
print(wc_effects.shape)
wc_effects.sort_values(by='rank')

(14, 7)


Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
1,happy,1699,1699,1.0,0.171564,0.171564,0.844433
0,relaxed,1571,1571,2.0,0.158639,0.330203,0.780815
9,euphoric,1474,1474,3.0,0.148844,0.479047,0.732604
5,uplifted,1345,1345,4.0,0.135817,0.614864,0.668489
4,creative,652,652,5.0,0.065839,0.680703,0.324056
12,sleepy,646,646,6.0,0.065233,0.745936,0.321074
2,energetic,566,566,7.0,0.057154,0.80309,0.281312
11,focused,532,532,8.0,0.053721,0.856811,0.264414
7,hungry,407,407,9.0,0.041099,0.89791,0.202286
8,talkative,303,303,10.0,0.030597,0.928507,0.150596


In [39]:
wc_flavor = count(df['Flavor_tokens'])
print(wc_flavor.shape)
wc_flavor.sort_values(by='rank')

(51, 7)


Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
11,earthy,998,998,1.0,0.162753,0.162753,0.496024
13,sweet,971,971,2.0,0.15835,0.321102,0.482604
8,citrus,475,475,3.0,0.077462,0.398565,0.236083
12,pungent,410,410,4.0,0.066862,0.465427,0.203777
18,berry,336,336,5.0,0.054795,0.520222,0.166998
15,pine,276,276,6.0,0.04501,0.565232,0.137177
2,flowery,241,241,7.0,0.039302,0.604534,0.119781
3,woody,228,228,8.0,0.037182,0.641716,0.11332
1,diesel,220,220,9.0,0.035877,0.677593,0.109344
6,spicy,205,205,10.0,0.033431,0.711024,0.101889


In [40]:
print(len(wc_effects['word'].to_list()))
wc_effects = wc_effects.sort_values(by='rank')
wc_effects['word'].to_list()

14


['happy',
 'relaxed',
 'euphoric',
 'uplifted',
 'creative',
 'sleepy',
 'energetic',
 'focused',
 'hungry',
 'talkative',
 'tingly',
 'giggly',
 'aroused',
 'none']

In [41]:
wc_flavor = wc_flavor.sort_values(by='rank')
print(len(wc_flavor['word'].to_list()))
wc_flavor['word'].to_list()[:26]

51


['earthy',
 'sweet',
 'citrus',
 'pungent',
 'berry',
 'pine',
 'flowery',
 'woody',
 'diesel',
 'spicy',
 'herbal',
 'lemon',
 'skunk',
 'tropical',
 'blueberry',
 'grape',
 'orange',
 'cheese',
 'pepper',
 'lime',
 'strawberry',
 'grapefruit',
 'sage',
 'minty',
 'pineapple',
 'none']

### Flavors: 
Earthy, Sweet, Citrus, Pungent, Berry, Pine, Flowery, Woody,
Spicy, Herbal, Lemon, Lavender, Tropical, Blueberry, Grape,
Orange, Pepper, Lime, Strawberry, Grapefruit, Sage,
Minty, Pineapple, None, Vanilla, Apple.

### Effects: 
Happy, Relaxed, Euphoric, Uplifted, Creative, Sleepy, Energetic,
Focused, Hungry, Talkative, Tingly, Giggly, Aroused, None.

### Ailments:
Depression, Inflammation, Insomnia, Lack of Appetite, Muscle Spasms,
Nausea, Pain, Seizures, Stress, Anxiety, Headaches, Fatigue.

In [42]:
df['Description'] = df['Description'].str.replace(u'\xa0', u' ')
df['Description'].head()

0    The ‘98 Aloha White Widow is an especially pot...
1    1024 is a sativa-dominant hybrid bred in Spain...
2    13 Dawgs is a hybrid of G13 and Chemdawg genet...
3    Also known as Kosher Tangie, 24k Gold is a 60%...
4    The 3 Kings marijuana strain, a holy trinity o...
Name: Description, dtype: object

In [43]:
# Tokenizer Pipe
from spacy.tokenizer import Tokenizer

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

tokens = []
STOP_WORDS = nlp.Defaults.stop_words.union([' '])

""" Make them tokens """
for doc in tokenizer.pipe(df['Description'], batch_size=500):
    doc_tokens = []   
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower()) 
    tokens.append(doc_tokens)

df['Description_tokens'] = tokens

In [44]:
#wc_description = count(df['Description_tokens'])
#print(wc_description.shape)
#wc_description.head()

In [45]:
df['General_Description'] = df['Effects']+""+df['Flavor']+""+df['Description']

In [46]:
df['General_Description'][0]

'Relaxed,Aroused,Creative,Happy,EnergeticFlowery,Violet,DieselThe ‘98 Aloha White Widow is an especially potent cut of White Widow that has grown in renown alongside Hawaiian legends like Maui Wowie and Kona Gold. This White Widow phenotype reeks of diesel and skunk and has a rich earthy taste with intermittent notes of hash. Its buds are coated in trichomes, giving its dark foliage a lustrous glint to go along with its room-filling odor. This one-hitter-quitter uplifts the mind with mind-bending euphoria that materializes in the body as airy relaxation. ‘98 Aloha White Widow is available from Pua Mana 1st Hawaiian Pakalōlō Seed Bank.  '

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Create a vocabulary and get word counts per document
# Similiar to fit_predict
dtm = tfidf.fit_transform(df['General_Description'])

# Print word counts

# Get feature names to use as dataframe column headers
general_dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
general_dtm.head()

Unnamed: 0,09,10,100,11,12,13,14,15,16,17,...,zen,zest,zestful,zesty,zeta,zion,zkittlez,zombie,zone,zoning
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.307015,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
from sklearn.neighbors import NearestNeighbors
# Fit on dtm

nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(general_dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [49]:
my_desc = [""" 
none,
pineapple,  peach, 
Depression
"""]

new = tfidf.transform(my_desc)
new

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [50]:
_, similar_topic_indices = nn.kneighbors(new.todense())

### Provides 5 strains based on the input (Example)

In [51]:
#columns = ['Strain', 'Type', 'Rating', 'Effects', 'Flavor', 'Description']

strains = [df['Strain'][t] for t in similar_topic_indices]
print('* - * - Strains - * - *')
print(strains, sep='\n\n')
print(end="\n\n")

types = [df['Type'][t] for t in similar_topic_indices]
print('* - * - Types - * - *')
print(types, sep='\n\n')
print(end="\n\n")

ratings = [df['Rating'][t] for t in similar_topic_indices]
print('* - * - Ratings - * - *')
print(ratings, sep='\n\n')
print(end="\n\n")

similar_flavors = [df['Flavor'][t] for t in similar_topic_indices]
print('* - * - Flavors - * - *')
print(similar_flavors, sep='\n\n')
print(end="\n\n")

similar_effects = [df['Effects'][t] for t in similar_topic_indices]
print('* - * - Effects - * - *')
print(similar_effects, sep='\n\n')
print(end="\n\n")

similar_topics = [df['Description'][t] for t in similar_topic_indices]
print('* - * - Descriptions - * - *')
print(similar_topics, sep='\n\n')

* - * - Strains - * - *
[770                Golden-Pineapple
1371                   Pineapple-Og
1269            Northwest-Pineapple
1377    Pineapple-Super-Silver-Haze
1373         Pineapple-Purple-Skunk
Name: Strain, dtype: object]


* - * - Types - * - *
[770     hybrid
1371    sativa
1269    hybrid
1377    sativa
1373    hybrid
Name: Type, dtype: object]


* - * - Ratings - * - *
[770     4.5
1371    4.6
1269    5.0
1377    4.7
1373    4.5
Name: Rating, dtype: float64]


* - * - Flavors - * - *
[770     Pineapple,Tropical,Citrus
1371    Pineapple,Citrus,Tropical
1269              Sweet,Pineapple
1377       Pineapple,Sweet,Earthy
1373       Sweet,Citrus,Pineapple
Name: Flavor, dtype: object]


* - * - Effects - * - *
[770      Happy,Euphoric,Uplifted,Relaxed,Creative
1371    Happy,Uplifted,Focused,Creative,Energetic
1269              Uplifted,Euphoric,Happy,Relaxed
1377      Happy,Euphoric,Energetic,Focused,Tingly
1373      Relaxed,Happy,Uplifted,Euphoric,Aroused
Name: Effects, dtyp

In [None]:
#df.to_csv('cannabis.csv')

### Code to add spaces in between words in 'Effects', 'Flavor' columns

In [52]:
#def tokenize(text):
#    tokens = re.sub(r'[^a-zA-Z ^0-9]', ',', text)
#    tokens = tokens.replace(',', ' ')
#    tokens = tokens.split()
#    string = ', '.join(tokens)
#    return string

#df['Effects'] = df['Effects'].apply(tokenize)
#df['Flavor'] = df['Flavor'].apply(tokenize)

In [53]:
#df.to_csv('Cannabis_2.csv')

In [54]:
#with open('Strains.txt', 'w') as f:
#    strains = df['Strain'].to_json()
#    f.writelines(strains)