In [262]:
import pandas as pd

# Load the species table and make sure every habitat comment is a string so
# the text-processing steps below can treat the column uniformly.
data_path = 'data.csv'
df = pd.read_csv(data_path, header=0)
# Missing comments are read as NaN; plain str() would turn them into the
# literal text 'nan', which would later surface as a bogus token. Map them to
# an empty string instead.
df['Habitat Comments'] = df['Habitat Comments'].map(
    lambda comment: '' if pd.isna(comment) else str(comment)
)
df.head()

Unnamed: 0,Scientific Name,Habitat Comments
0,Acris crepitans,This species inhabits the edges of sunny marsh...
1,Acris gryllus,"Grassy margins of swamps, marshes, lakes, pond..."
2,Ambystoma bishopi,The following information pertains to the <i>A...
3,Ambystoma cingulatum,The following information pertains to the <i>A...
4,Ambystoma maculatum,Spotted salamanders inhabit hardwood and mixed...


In [263]:
from bs4 import BeautifulSoup
import nltk
# Fetch the NLTK resources used in this notebook:
#   - 'stopwords' feeds the stop_words set built below,
#   - 'averaged_perceptron_tagger' backs nltk.tag.pos_tag (remove_not_nouns),
#   - 'wordnet' is required by nltk.stem.WordNetLemmatizer (lemmatize_comment);
#     without it lemmatization raises LookupError on a fresh environment.
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# A set gives constant-time membership checks when filtering tokens.
stop_words = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tylercalhoun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tylercalhoun/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
def remove_stopwords(habitat_comment):
    """Tokenize a comment and drop English stop words.

    The text is split into runs of word characters. A token is discarded when
    its lower-cased form appears in the module-level ``stop_words`` set; the
    remaining tokens are returned as a list in their original order and case.
    """
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    kept = []
    for token in tokenizer.tokenize(habitat_comment):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

def remove_not_nouns(habitat_comment):
    """Keep only the tokens whose part-of-speech tag starts with 'N'.

    ``habitat_comment`` is handed directly to ``nltk.tag.pos_tag``, so it is
    expected to be a sequence of tokens rather than a raw string. Returns the
    matching tokens as a list, in their original order.
    """
    nouns = []
    for token, tag in nltk.tag.pos_tag(habitat_comment):
        if tag.startswith('N'):
            nouns.append(token)
    return nouns

def lemmatize_comment(comment):
    """Lemmatize every token of ``comment`` with NLTK's WordNet lemmatizer.

    ``comment`` is an iterable of tokens. Each token is lemmatized with the
    lemmatizer's default part-of-speech setting, and the results are returned
    as a list in iteration order.
    """
    lemmatize = nltk.stem.WordNetLemmatizer().lemmatize
    return list(map(lemmatize, comment))

# Clean each comment in stages: strip HTML markup, tokenize and drop stop
# words, lemmatize the surviving tokens, then collapse them into a set of
# unique tokens. The column is overwritten in place with the resulting sets.
df['Habitat Comments'] = (
    df['Habitat Comments']
    .transform(lambda markup: BeautifulSoup(markup, 'html.parser').text)
    .transform(remove_stopwords)
    .transform(lemmatize_comment)
    .transform(lambda tokens: set(tokens))
)

df['Habitat Comments'].head()

  df['Habitat Comments'] = df['Habitat Comments'].transform(lambda x: BeautifulSoup(x, 'html.parser').text)


0    {edge, marsh, ditch, site, Hibernation, Trauth...
1    {shade, marsh, ditch, excessive, habitat, swam...
2    {swamp, baldwinii, Magnolia, Harper, 9, Larvae...
3    {swamp, baldwinii, Magnolia, Harper, 9, Larvae...
4    {burrow, period, stump, swamp, breeding, verna...
Name: Habitat Comments, dtype: object

In [None]:
def incidence_vector(words, habitats):
    """Build a 0/1 vector marking which habitat terms occur in ``words``.

    Args:
        words: Iterable of hashable tokens (strings in this notebook). It may
            be a set, a list, or a one-shot iterator; it is consumed once.
        habitats: Iterable of habitat terms that defines the vector's order.

    Returns:
        A list with one entry per habitat: 1 if that exact term (matching is
        case-sensitive) is among ``words``, otherwise 0.
    """
    # Build the lookup once so each habitat costs a single hash probe rather
    # than a scan over every word. Materialising it up front also keeps
    # one-shot iterators from being exhausted after the first habitat.
    present = words if isinstance(words, (set, frozenset)) else set(words)
    return [1 if habitat in present else 0 for habitat in habitats]

# Read the whitespace-separated habitat vocabulary. The `with` block closes
# the file on exit, so no explicit close() is needed, and the file does not
# have to stay open while the DataFrame is processed.
with open("habitats.txt", "r") as file:
    habitats = file.read().split()

# Replace each comment's token set with its incidence vector over `habitats`.
df['Habitat Comments'] = df['Habitat Comments'].transform(lambda x: incidence_vector(x, habitats))

df['Habitat Comments'].head()


0    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
2    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
Name: Habitat Comments, dtype: object

In [267]:
# Persist the processed table. The DataFrame index is written as well (pandas
# default), and the list-valued 'Habitat Comments' cells are stored in their
# string form.
df.to_csv(path_or_buf="habitat_comments.csv")