## NLP Class 3 Exercise 1:
- Read tweets into Pandas Dataframe
- Identify Bigrams and Trigrams for the top frequently mentioned AI / ML / NLP technologies

**Suggestions:** 
- Eliminate URLs, Mentions, Hashtags, RTs and newline characters
- Clean-up n-grams by eliminating punctuation, number, stopwords and lowercasing the text
- Add custom stopwords filters to get more relevant results

In [1]:
#import nltk
#nltk.download('popular', halt_on_error=False)
#nltk.download('all', halt_on_error=False)

In [1]:
import nltk as nltk
import nltk.corpus  
from nltk.text import Text
import pandas as pd
import re
import sys

In [2]:
print(sys.version)

3.10.9 (tags/v3.10.9:1dd9be6, Dec  6 2022, 20:01:21) [MSC v.1934 64 bit (AMD64)]


### Load Data

In [3]:
url = 'https://storage.googleapis.com/msca-bdp-data-open/tweets/tweets_ai_ml_nlp.json'
tweets = pd.read_json(url, orient='records', lines=True)

print(f'Dataset records: {tweets.shape[0]}, Dataset columns: {tweets.shape[1]}')

Dataset records: 100043, Dataset columns: 7


In [4]:
tweets.head(5)

Unnamed: 0,id,lang,date,name,text,extended_text,quoted_text
0,1529094548005064705,en,2022-05-24,odol☘️,RT @Frank4NC: CodyFight is a must watch and mu...,CodyFight is a must watch and must EARN! Get r...,Codyfight is a place where Humans and #AI comp...
1,1529094585942568960,en,2022-05-24,Paijo s'Bejo,RT @Bakercrypt0: Wonderful day to everybody! ✨...,Wonderful day to everybody! ✨🫶\n\nThe trailer ...,Codyfight is a place where Humans and #AI comp...
2,1529094709771051013,en,2022-05-24,🍀Ging🍀6️⃣5️⃣🎹,RT @Frank4NC: CodyFight is a must watch and mu...,CodyFight is a must watch and must EARN! Get r...,Codyfight is a place where Humans and #AI comp...
3,1529094719120510976,en,2022-05-24,Ultra mild🗯💫,RT @codyfight: Codyfight is a place where Huma...,Codyfight is a place where Humans and #AI comp...,
4,1529094845393907712,en,2022-05-24,Ohayou🌼,RT @ninasimonic: Wonderful day to everybody! ✨...,Wonderful day to everybody! ✨🫶\n\nTheir traile...,Codyfight is a place where Humans and #AI comp...


#### Use TweetTokenizer to tokenize Tweets

In [6]:
def remove_content(string):
    url_pattern = r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'
    mention_pattern = r'[\s]*@[\w]+'
    hashtag_pattern = r'[\s]*#[\w]+'
    string_fixed = re.sub(url_pattern,"",string)
    string_fixed = re.sub(hashtag_pattern,"",string_fixed)
    string_fixed = re.sub(mention_pattern,"",string_fixed)
    return string_fixed
tweets["fixed_extended"] = tweets["extended_text"].apply(remove_content)

In [17]:
tweet_text = tweets['fixed_extended'].str.lower().str.replace(r'\|', ' ', regex=True).str.cat(sep=' ')
tweet_tokenizer = nltk.tokenize.TweetTokenizer()
words = tweet_tokenizer.tokenize(tweet_text)
stopwords = set(nltk.corpus.stopwords.words('english'))
words = [word for word in words if word.isalpha()]
words = [word for word in words if not word.isnumeric()]
words = [word.lower() for word in words]
words = [word for word in words if word not in stopwords]
bgs = nltk.bigrams(words)
targeted_bgs = ['machine','learning','ai','artificial','intelligence','natural','language','processing','chatgpt','data','science','python','r','c','analytics','ml','nlp']
bgs = [b for b in bgs if (b[0] in targeted_bgs) or (b[1] in targeted_bgs)]
bigrams_freq = nltk.FreqDist(bgs)
bigrams_freq_df = pd.DataFrame(bigrams_freq.most_common(),columns=['Word', 'Frequency'])
bigrams_freq_df.head(n=20)

Unnamed: 0,Word,Frequency
0,"(machine, learning)",3280
1,"(artificial, intelligence)",3069
2,"(data, science)",2032
3,"(analytics, team)",1172
4,"(insights, analytics)",1170
5,"(deep, learning)",1046
6,"(big, data)",461
7,"(learning, python)",374
8,"(data, scientist)",351
9,"(data, analytics)",340


In [25]:
tweet_text = tweets['fixed_extended'].str.lower().str.replace(r'\|', ' ', regex=True).str.cat(sep=' ')
tweet_tokenizer = nltk.tokenize.TweetTokenizer()
words = tweet_tokenizer.tokenize(tweet_text)
stopwords = set(nltk.corpus.stopwords.words('english'))
words = [word for word in words if word.isalpha()]
words = [word for word in words if not word.isnumeric()]
words = [word.lower() for word in words]
words = [word for word in words if word not in stopwords]
bgs = nltk.trigrams(words)
targeted_bgs = ['machine','learning','ai','artificial','intelligence','natural','language','processing','chatgpt','data','science','python','r','c','analytics','ml','nlp']
bgs = [b for b in bgs if (b[0] in targeted_bgs) or (b[1] in targeted_bgs)]
trigrams_dist = nltk.FreqDist(bgs)
trigrams_dist_df = pd.DataFrame(trigrams_dist.most_common(),columns=['Word', 'Frequency'])
trigrams_dist_df.head(n=20)

Unnamed: 0,Word,Frequency
0,"(insights, analytics, team)",1170
1,"(analytics, team, using)",668
2,"(analytics, team, usafacts)",502
3,"(learning, data, science)",235
4,"(machine, learning, data)",215
5,"(deep, learning, python)",203
6,"(artificial, intelligence, ai)",177
7,"(trending, ai, ml)",160
8,"(ai, ml, article)",160
9,"(ml, article, identified)",160


In [8]:
import datetime
import pytz

datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Wed, 05 April 2023 20:21:53'