In [22]:
#Import libraries
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline

In [23]:
# Load the scraped tweets from the CSV that sits next to this notebook
csv_path = "./tweet_df.csv"
tweet_df = pd.read_csv(csv_path)
# Preview the first three rows
tweet_df.head(3)

Unnamed: 0.1,Unnamed: 0,tweets,user_location
0,0,"After more than a year of #workfromhome life, ...","Chagrin Falls, OH 44023"
1,1,Start a business without being tech-savvy\n\nS...,Minnesota
2,2,Easy Article Writing tips - Affiliate Working ...,


In [24]:
# Drop the leftover 'Unnamed: 0' column.
# A non-mutating drop with errors='ignore' keeps this cell idempotent: running it
# a second time no longer raises KeyError because the column is already gone.
tweet_df = tweet_df.drop(columns='Unnamed: 0', errors='ignore')

In [25]:
# Preview the first three rows to confirm the 'Unnamed: 0' column was removed
tweet_df.head(3)

Unnamed: 0,tweets,user_location
0,"After more than a year of #workfromhome life, ...","Chagrin Falls, OH 44023"
1,Start a business without being tech-savvy\n\nS...,Minnesota
2,Easy Article Writing tips - Affiliate Working ...,


In [26]:
# Normalise every tweet: lowercase -> tokenize -> remove stop words -> stem.
p_stemmer = PorterStemmer()  # Instantiate the Porter stemmer
tokenizer = RegexpTokenizer(pattern=r'\w+')  # keep runs of word characters only

# Build the stop-word collection ONCE. It was previously re-requested from the
# corpus reader for every single token, and membership was tested against a list
# (linear scan); a set gives the same answer with constant-time lookups.
stop_words = set(stopwords.words('english'))

# Lowercase each tweet string
words = [tweet.lower() for tweet in tweet_df['tweets']]
# Split each tweet string into a list of word tokens
tokenized_words = [tokenizer.tokenize(x) for x in words]

# For each tweet: keep the non-stop-word tokens, reduce each to its stem, and
# join the stems back into one space-separated string (one string per row).
clean_column_list = [
    " ".join(p_stemmer.stem(word) for word in word_list if word not in stop_words)
    for word_list in tokenized_words
]

In [33]:
# Replace the 'tweets' column with the stemmed, stop-word-free strings
# (clean_column_list holds one string per row, in row order).
# NOTE(review): the saved output of this cell already contains a `polarity`
# column, so it was last executed after the sentiment cell; re-run the notebook
# top to bottom to confirm the results still hold in document order.
tweet_df = tweet_df.assign(tweets=clean_column_list)
# Preview the first three rows to check the new 'tweets' values
tweet_df.head(3)

Unnamed: 0,tweets,user_location,polarity
0,year workfromhom life could use chang sceneri ...,"Chagrin Falls, OH 44023",0
1,start busi without tech savvi see blog post ht...,Minnesota,0
2,easi articl write tip affili work onlin http c...,,0


In [28]:
# Sentiment classifier. The checkpoint is pinned explicitly (it is the one the
# default resolved to in the captured log) so that a change of the library's
# default model cannot silently alter the polarity labels.
sent = pipeline('sentiment-analysis',
                model='distilbert-base-uncased-finetuned-sst-2-english')

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
def getPolarity(tweet):
    """Return 1 if the sentiment pipeline labels `tweet` as POSITIVE, else 0.

    The text is passed to the module-level `sent` callable; only the `label`
    of the first returned prediction is inspected.
    """
    top_prediction = sent(tweet)[0]
    return int(top_prediction['label'] == 'POSITIVE')

In [32]:
# Score every tweet with getPolarity and store the 0/1 results as a new column
polarity_labels = [getPolarity(tweet) for tweet in tweet_df['tweets']]
tweet_df['polarity'] = polarity_labels

In [34]:
# Display the frame, which now includes the polarity column
# NOTE(review): a bare frame renders pandas' truncated repr; tweet_df.head()
# would be a lighter preview if the full repr is not needed.
tweet_df

Unnamed: 0,tweets,user_location,polarity
0,year workfromhom life could use chang sceneri ...,"Chagrin Falls, OH 44023",0
1,start busi without tech savvi see blog post ht...,Minnesota,0
2,easi articl write tip affili work onlin http c...,,0
3,call sunday night changeofplan changeofdirect ...,Pune,0
4,market vacat rental take covid 19 remot work t...,,0
...,...,...,...
4995,rt remoteio_job quickli find great remotejob l...,,0
4996,rt onlineprobret http co 5nozgibwpi done succe...,oman,0
4997,rt skittercom traffic bitch pleas retweet http...,Donde estoy ubicado,0
4998,rt mayhemmarriag worri formula equat write tho...,,0


In [35]:
# Text column that is fed to the vectorizer below
X = tweet_df['tweets']

In [36]:
# Bag-of-n-grams vectorizer
cv = CountVectorizer(
    ngram_range=(1, 5),  # unigrams up to 5-grams
    min_df=2,            # keep only terms that occur in at least two tweets
)

In [37]:
# Learn the n-gram vocabulary from X and encode every tweet as a row of term counts
cv_X = cv.fit_transform(X)

In [38]:
# Create cv_train dataframe
cv_X_df = pd.DataFrame(cv_X.toarray(), columns = cv.get_feature_names())
cv_X_df.head()

Unnamed: 0,000,000 http,000 http co,000 month,000 month free,000 month free webinar,000 month month,000 month month watch,000 month month watch video,000 onlin,...,𝗥𝗘𝗠𝗢𝗧𝗘 job seo,𝗥𝗘𝗠𝗢𝗧𝗘 job social,𝗥𝗘𝗠𝗢𝗧𝗘 job social media,𝗥𝗘𝗠𝗢𝗧𝗘 job social media coordin,𝗥𝗘𝗠𝗢𝗧𝗘 job social media manag,𝗥𝗘𝗠𝗢𝗧𝗘 job ux,𝗥𝗘𝗠𝗢𝗧𝗘 job ux design,𝗥𝗘𝗠𝗢𝗧𝗘 job video,𝗦𝗲𝗿𝘃𝗶𝗰𝗲,𝗦𝗲𝗿𝘃𝗶𝗰𝗲 role
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Most common terms: total count of each n-gram across all tweets, top 12
term_totals = cv_X_df.sum()
term_totals.sort_values(ascending=False).head(12)

co             4935
http           4932
http co        4896
workfromhom    3837
work           1789
rt             1713
job            1171
home            937
work home       585
busi            584
remot           581
remotework      530
dtype: int64