# [AttributeError when building list comprehension for Wordnet.Synsets().Definition()](https://stackoverflow.com/questions/52392130/attributeerror-when-building-list-comprehension-for-wordnet-synsets-definition/52394042?noredirect=1#comment91733736_52394042)

In [None]:
import pandas as pd
from pprint import pprint as pp
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

# these two methods do the same thing
from nltk.tokenize import TreebankWordTokenizer as tok
from nltk.tokenize import word_tokenize

In [None]:
# Load the source CSV; its 'Category' column is the text tagged further down.
df = pd.read_csv('data/2018-09-18_CIP.csv')

In [None]:
df.head()

In [None]:
# Materialise the English stopword list as a set, built once, so the
# per-word "not in english_stops" filter is a hash lookup.
english_stops = set(stopwords.words('english'))

In [None]:
# Gets the name of the first synset for a given term (None if there is none).

def get_synset(word):
    """
    Look up a word in WordNet and return the name of its first synset.
    :parameter: word
    :type: string
    :return: name of the first synset, or None when WordNet returns no synsets
    """
    first_match = next(iter(wn.synsets(word)), None)
    if first_match is None:
        return None
    return first_match.name()

In [None]:
# Gets the definition for a synset name.

def get_def(syn):
    """
    Return the WordNet definition for a synset name.
    :parameter: syn - synset name as returned by get_synset, or None
    :type: string or None
    :return: the definition text, or '' when syn is None
    """
    # get_synset returns None for words without a synset; map that to an
    # empty definition instead of failing inside wn.synset.
    if syn is None:
        return ''
    return wn.synset(syn).definition()

In [None]:
# Tags one Category string: returns its cleaned words, their synsets, and their definitions. Meant to be used with Series.apply.

def sector_tagger(frame):
    """
    Receives each row of the DataFrame column - using apply
    Transform the row using NLTK to produce the individual words, their
    synsets, and the word definitions.
    :parameter: frame
    :type: string
    :return: (clean_words, synset, def_matrix) - three lists of equal length
    :type: tuple of lists
    """
    # Single pass over the text: '/' becomes a space so the words on either
    # side stay separate; parentheses and commas are dropped.
    cleanup = str.maketrans({'/': ' ', '(': None, ')': None, ',': None})
    frame = frame.translate(cleanup)

    # tok().tokenize(frame) (TreebankWordTokenizer) does the same job with a
    # different method.
    tok_list = word_tokenize(frame)

    split_words = [w.lower() for w in tok_list]
    clean_words = [w for w in split_words if w not in english_stops]
    synset = [get_synset(w) for w in clean_words]
    # Words without a synset come back as None; give them an empty definition.
    def_matrix = [get_def(s) if s is not None else '' for s in synset]
    return clean_words, synset, def_matrix

In [None]:
# Run sector_tagger on every Category string; each element of the result is
# the (clean_words, synset, def_matrix) tuple for one row.
agri_set = df['Category'].apply(sector_tagger)

### Each phrase passed in returns a tuple of three lists: clean_words, synsets, and definitions

In [None]:
agri_set[0]

# Create a DataFrame where each column is a list

In [None]:
# Split the per-row (clean_words, synset, def_matrix) tuples into three
# parallel lists, one entry per row of agri_set.
list_clean_words = [row[0] for row in agri_set]
list_synset = [row[1] for row in agri_set]
list_def_matrix = [row[2] for row in agri_set]

In [None]:
# Start from an empty frame; the columns are attached one at a time below.
agri_matrix = pd.DataFrame()

In [None]:
agri_matrix['Categories'] = list_clean_words

In [None]:
agri_matrix['Synsets'] = list_synset

In [None]:
agri_matrix['Definition'] = list_def_matrix

In [None]:
agri_matrix.head()

# Alternatively, split each list of lists into a long list (they're ordered)

In [None]:
def create_long_list_from_list_of_lists(list_of_lists):
    """
    Flatten one level of nesting, keeping the original order.
    :parameter: list_of_lists - iterable of iterables
    :return: a new list with the items of every inner iterable, in order
    """
    return [item for inner in list_of_lists for item in inner]

In [None]:
# Flatten each per-row list. The three flat lists stay index-aligned because
# every row contributes exactly one synset and one definition per clean word.
long_list_clean_words = create_long_list_from_list_of_lists(list_clean_words)
long_list_synset = create_long_list_from_list_of_lists(list_synset)
long_list_def_matrix = create_long_list_from_list_of_lists(list_def_matrix)

# Turn it into a DataFrame of Unique Categories

In [None]:
# One row per (word, synset, definition) triple; repeated triples are dropped
# and the index is renumbered from 0.
agri_df = (
    pd.DataFrame({
        'Categories': long_list_clean_words,
        'Synsets': long_list_synset,
        'Definitions': long_list_def_matrix,
    })
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
agri_df.head()

# Alternate sector_tagger Method

In [None]:
# Creates a dataframe called sector_matrix based on another dataframe's column. Should be followed with an export.

def sector_tagger(frame):
    """
    Receives one entire DataFrame column not a row, like with apply
    Transform the column using NLTK to produce a DataFrame with Categories
    of individual words, synsets, and word definitions.
    :parameter: frame
    :type: pandas.core.series.Series
    :return: sector_matrix - columns 'Categories', 'Synsets', 'Definition',
             with duplicate rows removed and the index reset
    :type: pandas.core.frame.DataFrame
    """
    # Single pass per string: '/' becomes a space so the words on either side
    # stay separate; parentheses and commas are dropped.
    cleanup = str.maketrans({'/': ' ', '(': None, ')': None, ',': None})
    cleaned = frame.apply(lambda row: row.translate(cleanup))

    # [tok().tokenize(w) for w in cleaned] (TreebankWordTokenizer) does the
    # same job with a different method.
    tok_list = [word_tokenize(w) for w in cleaned]

    split_words = [w.lower() for sub in tok_list for w in sub]
    clean_words = [w for w in split_words if w not in english_stops]
    synset = [get_synset(w) for w in clean_words]
    # Words without a synset come back as None; give them an empty definition.
    definitions = [get_def(s) if s is not None else '' for s in synset]

    sector_matrix = pd.DataFrame({
        'Categories': clean_words,
        'Synsets': synset,
        'Definition': definitions,
    })
    return sector_matrix.drop_duplicates().reset_index(drop=True)

In [None]:
# The alternate sector_tagger takes the whole column, so it is called
# directly instead of through Series.apply.
agri_matrix = sector_tagger(df['Category'])

In [None]:
agri_matrix.head()

# Original Code

In [None]:
import pandas as pd
from pandas import DataFrame, Series
import nltk.data
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import TreebankWordTokenizer


# One shared tokenizer instance and the stopword sets used for filtering.
tok = TreebankWordTokenizer()
english_stops = set(stopwords.words('english'))
# NOTE(review): french_stops is not referenced by the visible code - confirm
# whether it is still needed.
french_stops = set(stopwords.words('french'))


# Gets the name of the first synset for a given term (None if there is none).

def get_synset(word):
    """Return the name of the first WordNet synset for word, or None if WordNet returns none."""
    matches = iter(wn.synsets(word))
    try:
        first = next(matches)
    except StopIteration:
        return None
    return first.name()

# Gets the definition for a synset name.

def get_def(syn):
    """
    Return the WordNet definition for a synset name.

    syn is a synset name as returned by get_synset, or None when the word had
    no synset; None yields an empty string.
    """
    if syn is None:
        return ''
    # wn.synset (singular) resolves one synset by name. wn.synsets (plural)
    # returns a list, which has no .definition() and raised AttributeError.
    return wn.synset(syn).definition()

# Creates a dataframe called sector_matrix based on another dataframe's column. Should be followed with an export.

def sector_tagger(frame):
    """
    Build a DataFrame of words, synsets, and definitions from a text column.

    frame is an iterable of strings (here a pandas Series). Every string is
    tokenized, lower-cased, and stripped of English stopwords; each remaining
    word becomes one row with columns 'Categories', 'Synsets', 'Definition'.
    """
    tok_list = [tok.tokenize(w) for w in frame]
    split_words = [w.lower() for sub in tok_list for w in sub]
    clean_words = [w for w in split_words if w not in english_stops]
    synset = [get_synset(w) for w in clean_words]
    # Words without a synset come back as None; give them an empty definition
    # rather than handing None to get_def.
    definitions = [get_def(s) if s is not None else '' for s in synset]
    sector_matrix = DataFrame({'Categories': clean_words,
                               'Synsets': synset,
                               'Definition': definitions})
    return sector_matrix

# Load the CSV and run the tagger over its 'Category' column.
test = pd.read_csv('data/2018-09-18_CIP.csv')

agri_matrix = sector_tagger(test['Category'])