In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd 
import tensorflow as tf
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
import nltk

In [3]:
np.version.version

'1.20.2'

In [5]:
from gensim.corpora import Dictionary

AttributeError: partially initialized module 'smart_open' has no attribute 'local_file' (most likely due to a circular import)

In [100]:
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

### Load Dataset

In [101]:
data=pd.read_csv('datasets/tweets.csv')

In [102]:
#get stopword list
sw_list=stopwords.words('english')

#Since it might affect final outcome, a few words are removed from stopwords list
sw_list.remove('not')
sw_list.remove('no')
sw_list.remove('against')

In [103]:
#Initialize lemmatizer
lemmatizer=WordNetLemmatizer()

In [104]:
def nltk_wn_tag(nltk_tag):
  if nltk_tag.startswith('J'):
    return wordnet.ADJ
  elif nltk_tag.startswith('V'):
    return wordnet.VERB
  elif nltk_tag.startswith('N'):
    return wordnet.NOUN
  elif nltk_tag.startswith('R'):
    return wordnet.ADV
  else:                    
    return None

In [105]:
def lemmatize_sentence(sentence):
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    wordnet_tagged = map(lambda x: (x[0], nltk_wn_tag(x[1])), tagged)
    lemmatized_words=[]
    for word, tag in wordnet_tagged:
        if word not in sw_list:
            if tag is None:                        
                lemmatized_words.append(word)
            else:
                lemmatized_words.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_words)
    

In [106]:
def clean_tweets(tweet):
    #remove RT(retweet tag)
    tweet=re.sub(r"RT @","@",tweet)
    #remove pings
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    #remove URLs
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    #remove special characters
    tweet = re.sub(r"[^a-zA-Z]", ' ', tweet)
    #convert to lower case
    tweet=tweet.lower()
    #remove extra white spaces
    tweet = re.sub(r" +",' ', tweet)
    #lemmatization of tweet and removing stopwords
    lemmatized_tweet=lemmatize_sentence(tweet)
    #remove extra white spaces
    tweet = re.sub(r" +", ' ', tweet)
    return lemmatized_tweet.strip()

In [107]:
data['tweets']=[clean_tweets(tweet) for tweet in data['tweets']]

In [108]:
data['tweets']

0       obama call gop budget social darwinism nice tr...
1              teen year obama know use marijuana cocaine
2       ipa congratulates president barack obama leade...
3       whatsromneyhiding connection supporter critica...
4       obama approve targeted assassination modern us...
                              ...                        
1359    trend idiot look tweet lol make fun obama stup...
1360             kimkardashiansnextboyfriend barack obama
1361    gas obama take office guess promise would chan...
1362    haha know im smart mean get ta listen obama cu...
1363    obama dictator training pass training course n...
Name: tweets, Length: 1364, dtype: object

In [109]:
from sklearn.model_selection import train_test_split
X=data['tweets']
y=data['labels']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=7)

### Create corpus

In [111]:
corpus=[]
for tweet in X_train:
    for word in tweet.split(" "):
        if word not in corpus:
            corpus.append(word)

In [120]:
len(corpus)

2486

### Create a tfidf vector for 

In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer=TfidfVectorizer()
tfidf_vectorizer.fit(X_train)

TfidfVectorizer()

In [117]:
dictionary=dict(zip(tfidf_vectorizer.get_feature_names(),tfidf_vectorizer.idf_))

In [119]:
len(dictionary)

2469