# Preprocess tweets - from clean_tweets pickle
#### Text is already cleaned in step 1. In this step: tokenize, remove stop words, lemmatize, vectorize, ...

In [1]:
# Attach Google Drive at /content/drive (Colab-only step)
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
sns.set()
#nlp preprocessing libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from nltk.probability import FreqDist

In [3]:
# Read the cleaned train/test tweet frames that were pickled under the project folder.
# `path` is the project root on the mounted drive and is reused by later cells.
path = '/content/drive/MyDrive/Guvi Files/Final Project2(Tweets)/'
train_pickle = path+'data/train_tweets_clean.pkl'
test_pickle = path+'data/test_tweets_clean.pkl'
train_tweets = pd.read_pickle(train_pickle)
test_tweets = pd.read_pickle(test_pickle)


In [4]:
# Preview the first rows of the cleaned training frame
train_tweets.head()

Unnamed: 0,label,tweet,clean_tweet
0,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so sel...
1,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i cant use cause they don...
2,0,bihday your majesty,bihday your majesty
3,0,#model i love u take with u all the time in ...,i love u take with u all the time in ur
4,0,factsguide: society now #motivation,factsguide society now


In [5]:
# Preview the first rows of the cleaned test frame
test_tweets.head()

Unnamed: 0,tweet,clean_tweet
0,#studiolife #aislife #requires #passion #dedic...,to find
1,@user #white #supremacists want everyone to s...,want everyone to see the new and heres...
2,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your
3,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...
4,"3rd #bihday to my amazing, hilarious #nephew...",to my amazing hilarious eli ahmir uncle dav...


In [6]:
# Pull the cleaned text and the label column out of the training frame
train_clean_tweets, train_target = (
    train_tweets['clean_tweet'],
    train_tweets['label'],
)


## Tokenize and remove stop words

In [7]:
# Fetch the NLTK resources needed for tokenizing and for stop-word filtering
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Immutable set copy of the stop-word list: membership tests become O(1)
# instead of a linear scan of the list for every token.
# NOTE: later changes to `stop_words` are not reflected here; rebuild this
# set (re-run the cell) after editing the list.
stop_words_lookup = frozenset(stop_words)


def tokenize_and_remove_stopwords(text):
    """Tokenize a tweet and drop English stop words.

    Parameters
    ----------
    text : str
        Text to tokenize (passed straight to ``nltk.word_tokenize``).

    Returns
    -------
    list of str
        The tokens in their original order and original casing, excluding
        every token whose lowercase form is an NLTK English stop word.
    """
    return [
        token
        for token in nltk.word_tokenize(text)
        if token.lower() not in stop_words_lookup
    ]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# Tokenize every cleaned training tweet and strip stop words -> list of token lists
processed_data = [tokenize_and_remove_stopwords(tweet) for tweet in train_clean_tweets]
# Show only a small sample; printing the whole corpus floods the notebook output
processed_data[:5]



In [9]:
# Vocabulary = every distinct token left after stop-word removal
total_vocab = {token for tokens in processed_data for token in tokens}
len(total_vocab)

22203

In [10]:
# Persist the tokenized, stop-word-free tweets
processed_pickle = path+'data/processed_data.pkl'
with open(processed_pickle, 'wb') as out_file:
    pickle.dump(processed_data, out_file)

## Top words in corpus

In [11]:
# Collapse the per-tweet token lists into one flat list of tokens
flat_filtered = []
for tweet_tokens in processed_data:
    flat_filtered.extend(tweet_tokens)
# Count how often each token occurs across the whole corpus
clean_corpus_freqdist = FreqDist(flat_filtered)
# Display the 20 most frequent tokens with their counts
clean_corpus_freqdist.most_common(20)

[('day', 2026),
 ('happy', 1575),
 ('love', 1205),
 ('im', 1148),
 ('u', 1137),
 ('time', 1085),
 ('like', 973),
 ('today', 939),
 ('new', 917),
 ('get', 915),
 ('cant', 808),
 ('people', 803),
 ('good', 790),
 ('one', 774),
 ('see', 753),
 ('dont', 729),
 ('life', 712),
 ('go', 649),
 ('want', 647),
 ('take', 615)]

## Lemmatization

In [19]:
# Fetch WordNet, lemmatize every token, and rebuild each tweet as a single
# space-separated string (one string per tweet, same order as processed_data)
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
lemmatized_output = [
    ' '.join(lemmatizer.lemmatize(token) for token in tokens)
    for tokens in processed_data
]

# Left as a plain list of strings; the DataFrame conversion below is disabled
# lemmatized_output = pd.DataFrame(lemmatized_output)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# Show only a small sample; printing every lemmatized tweet floods the notebook output
lemmatized_output[:5]



In [21]:
# Persist the modeling inputs: the lemmatized tweet strings and their labels
lemmatized_pickle = path+'data/lemmatized_data.pkl'
with open(lemmatized_pickle, 'wb') as out_file:
    pickle.dump(lemmatized_output, out_file)

target_pickle = path+'data/train_target.pkl'
train_target.to_pickle(target_pickle)
