In [1]:
import pandas as pd
import re
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords, wordnet
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
%run "../config.py" # this imports variables from config.py as global

In [2]:
lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [3]:
text_column_name = 'text'
stop_words = set(stopwords.words('english'))
ngram_range = (1, 2)
min_df = 2 # cut-off value for ignoring rare words
max_df = 1.0
max_features = 300
vectorizer = CountVectorizer(encoding='utf-8',
                             ngram_range=ngram_range,
                             stop_words=stop_words,
                             max_df=max_df,
                             min_df=min_df,
                             max_features=max_features)

In [4]:
# Load data
if data_filepath.endswith('.json'):
    data = pd.read_json(data_filepath, lines=True)
elif data_filepath.endswith('.csv'):
    data = pd.read_csv(data_filepath)
print(data.head(10))
tweets = data[text_column_name]

                    id                                               text
0  1229818548740087809  my professor for my class on climate change li...
1  1229818483191492608  Climate research by the rich #climatememe #cli...
2  1229818231533309956  @wgg7wgg @pocphotocompany @DavidASeattle Sande...
3  1229818181428170753  Inspired by Greta Thunberg, a 101-year-old cha...
4  1229817872211378176  Here’s the best place to move if you’re worrie...
5  1229817855283269633  Bezos’ climate pledge of $10 billion amounts t...
6  1229817652513845248  Paddington Green: inside the anti-terror HQ ta...
7  1229817515506851842  @WilHovaJr To say nothing of the fact that if ...
8  1229817365703090176  Industrial #IoT: a silver bullet for climate c...
9  1229817118146715649  How should Jeff Bezos invest his $10bn Earth F...


In [5]:
def preprocess(item):
    item = item.lower() # convert to lowercase
    item = " ".join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in item.split()]) # lemmatizing
    item = item.replace('-', ' ') # replace dashes with whitespace
    # remove numbers, punctuation, tags and URLs
    item = re.sub(r'[^a-zA-Z ]+|(@[A-Za-z0-9]+)|http\S+', '', item)
    return item

In [6]:
preprocessed_tweets = tweets.apply(preprocess)
preprocessed_tweets.head(10)

0    my professor for my class on climate change li...
1    climate research by the rich climatememe clima...
2    wggwgg pocphotocompany davidaseattle sander be...
3    inspire by greta thunberg a  year old champion...
4    heres the best place to move if youre worried ...
5    bezos climate pledge of  billion amount to nea...
6    paddington green inside the anti terror hq tak...
7    wilhovajr to say nothing of the fact that if b...
8    industrial iot a silver bullet for climate cha...
9    how should jeff bezos invest his bn earth fund...
Name: text, dtype: object

In [7]:
features = vectorizer.fit_transform(preprocessed_tweets).toarray()
vectorizer.get_feature_names()

['advance',
 'advance climate',
 'ag',
 'ag office',
 'agenda',
 'air',
 'amazon',
 'amp',
 'analyses',
 'anyway',
 'become',
 'best',
 'bezos',
 'bezos climate',
 'bezos pledge',
 'billion',
 'billion fight',
 'bloomberg',
 'bloomberg program',
 'business',
 'care',
 'carriept',
 'carriept lisavanhoosept',
 'cbarnespt',
 'cbarnespt markmilligandpt',
 'center',
 'change',
 'change agenda',
 'chrishinzept',
 'chrishinzept jhaleatx',
 'climate',
 'climate change',
 'climate justice',
 'climate pledge',
 'climatechange',
 'coal',
 'come',
 'commits',
 'commits billion',
 'current',
 'current political',
 'dawnmagnusson',
 'dawnmagnusson drkaidpt',
 'dont',
 'drkaidpt',
 'drkaidpt chrishinzept',
 'economic',
 'ecosearch',
 'ecosearch news',
 'environment',
 'environment ecosearch',
 'environmental',
 'feel',
 'fight',
 'fight climate',
 'focus',
 'footprint',
 'fuel',
 'fund',
 'generation',
 'get',
 'give',
 'gun',
 'heres',
 'house',
 'impact',
 'important',
 'inspire',
 'jeff',
 'jeff b