In [46]:
import pandas as pd
import re
import json
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords, wordnet
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
%run "../config.py" # this imports variables from config.py as global

In [51]:
lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [59]:
text_column_name = 'text'
stop_words = set(stopwords.words('english'))
ngram_range = (1, 2)
min_df = 5 # cut-off value for ignoring rare words
max_df = 1.0
max_features = 500
vectorizer = CountVectorizer(encoding='utf-8',
                             ngram_range=ngram_range,
                             stop_words=stop_words,
                             max_df=max_df,
                             min_df=min_df,
                             max_features=max_features)

In [48]:
# Load data
if data_filepath.endswith('.json'):
    #data = pd.read_json(data_filepath, lines=True, encoding="utf8")
    data_arr = []
    column_names = ['id', text_column_name]
    with open(data_filepath, 'r') as f:
        for tweet in f:
            selected_row = []
            json_tweet = json.loads(tweet)
            try:
                # filter out retweets and non-English tweets
                if not json_tweet['retweeted'] and 'RT @' not in json_tweet[text_column_name] and json_tweet['lang'] == 'en':
                    for col in column_names:
                        selected_row.append(json_tweet[col])
                    data_arr.append(selected_row)
            except KeyError:
                pass
        data = pd.DataFrame(data_arr, columns=column_names)
elif data_filepath.endswith('.csv'):
    data = pd.read_csv(data_filepath)[column_names]
print(data.size)
print(data.head(10))
tweets = data[text_column_name]

52244
                    id                                               text
0  1236315739428192259  “ May I get your name? Mine is 8... “\n\nShe w...
1  1236315739667279872  MOVE AUBA IN THE MIDDLE AND BRING ON NELSON/MA...
2  1236315739713187842  @giantlittleman Im so happy for u yet so jealo...
3  1236315739805618183                                   Call me ice baby
4  1236315739709210625  @ZonePhysics I wonder what the CO2 count is, h...
5  1236315739512016897  @GWRHelp It was booked in advance, to confirm ...
6  1236315739436593153  Jed-Forest v Hawick Rugby Live Stream Iphone 7...
7  1236315739675557889  @Cyal8er3 @Beeeelzebub888 @UnCastellsMes @badi...
8  1236315739939909634  Dr. Wilson came and got my baby for his circum...
9  1236315739931492352  @cmclymer @MsNebraskaJones We have videos of h...


In [49]:
def preprocess(item):
    item = item.lower() # convert to lowercase
    item = " ".join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in item.split()]) # lemmatizing
    item = item.replace('-', ' ') # replace dashes with whitespace
    # remove numbers, punctuation, tags and URLs
    item = re.sub(r'[^a-zA-Z ]+|(@[A-Za-z0-9]+)|http\S+', '', item)
    return item

In [61]:
preprocessed_tweets = tweets.apply(preprocess)
preprocessed_tweets.head(10)

0     may i get your name mine be   she would pat h...
1    move auba in the middle and bring on nelsonmar...
2    giantlittleman im so happy for u yet so jealou...
3                                     call me ice baby
4    zonephysics i wonder what the co count is how ...
5    gwrhelp it be book in advance to confirm this ...
6    jed forest v hawick rugby live stream iphone t...
7    cyaler beeeelzebub uncastellsmes badibulgator ...
8    dr wilson come and get my baby for his circumc...
9    cmclymer msnebraskajones we have video of him ...
Name: text, dtype: object

In [63]:
features = vectorizer.fit_transform(preprocessed_tweets).toarray()
vectorizer.get_feature_names()

['absolutely',
 'account',
 'action',
 'actually',
 'add',
 'adventure',
 'adventure call',
 'agree',
 'aint',
 'already',
 'also',
 'always',
 'amaze',
 'american',
 'amp',
 'another',
 'anyone',
 'anything',
 'aquarius',
 'aries',
 'around',
 'ask',
 'attack',
 'away',
 'awesome',
 'baby',
 'back',
 'bad',
 'bc',
 'beautiful',
 'believe',
 'bernie',
 'berniesanders',
 'best',
 'biden',
 'big',
 'big energy',
 'birthday',
 'bit',
 'black',
 'book',
 'boy',
 'break',
 'bring',
 'bro',
 'business',
 'buy',
 'call',
 'call passionate',
 'cancer',
 'cant',
 'cant wait',
 'capricorn',
 'car',
 'care',
 'case',
 'cause',
 'chance',
 'chance cancer',
 'change',
 'check',
 'child',
 'city',
 'close',
 'code',
 'come',
 'come many',
 'conti',
 'conti libra',
 'continue',
 'continue move',
 'coronavirus',
 'could',
 'could go',
 'country',
 'coupon',
 'covid',
 'crisis',
 'damn',
 'day',
 'didnt',
 'die',
 'discount',
 'dm',
 'doesnt',
 'domestic',
 'domestic front',
 'dont',
 'dont know',
 'dr