# General Feature Extraction

## Data Loader

In [223]:
import pandas as pd
import numpy as np
import re

In [224]:
url = 'https://github.com/laxmimerit/All-CSV-ML-Data-Files-Download/raw/master/twitter4000.csv'
df = pd.read_csv(url)

In [225]:
df.head()

Unnamed: 0,tweets,sentiment
0,is bored and wants to watch a movie any sugge...,0
1,back in miami. waiting to unboard ship,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0
3,ughhh i am so tired blahhhhhhhhh,0
4,@mandagoforth me bad! It's funny though. Zacha...,0


In [226]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweets     4000 non-null   object
 1   sentiment  4000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 62.6+ KB


In [227]:
df.isnull().sum()

tweets       0
sentiment    0
dtype: int64

In [228]:
df['sentiment'].value_counts()

sentiment
0    2000
1    2000
Name: count, dtype: int64

## Characters Count

In [229]:
text = 'this is a   simple text'
len(text.replace(' ', ''))

17

In [230]:
# using the regex to remove the spaces
pattern = r'\s' # it say's that the space characters
re.sub(pattern, '', text)


'thisisasimpletext'

In [231]:
## lets count the chars in our tweet

df['char_counts'] = df['tweets'].apply(lambda x: len(re.sub(pattern, '', x)))
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts
3792,going to see hangover again with colby,1,32
1613,"watching disneey channel, myspace, aim and i'm...",0,79
2081,@Redwense hi mikey!,1,17
1448,Flu &amp; Sore throat......ain't fun,0,32
605,My scratched Halo 3 http://flic.kr/p/6yJRvo,0,39


## Word Counts

In [232]:
df['word_counts'] = df['tweets'].apply(lambda x: len(x.split()))
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts
2199,"@Vaquino Ayy so bitter. It's alright though, I...",1,113,25
549,@Monica_777 Ooh ok. Heard it was amazing but ...,0,75,14
3207,@CollingsA Who said that cottage industry is d...,1,43,8
2654,@trvsbrkr Sorry man!! We are so happy about th...,1,52,11
2674,@NGowers cute kids Sometimes I wish mine were...,1,108,23


## Average Word Length

In [233]:
df['avg_word_len'] = df['char_counts']/df['word_counts']

df['avg_word_len']=df['avg_word_len'].apply(lambda x: round(x, 2))

df.head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len
0,is bored and wants to watch a movie any sugge...,0,43,10,4.3
1,back in miami. waiting to unboard ship,0,32,7,4.57
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,54,12,4.5
3,ughhh i am so tired blahhhhhhhhh,0,27,6,4.5
4,@mandagoforth me bad! It's funny though. Zacha...,0,116,26,4.46


## Stop Words Count

In [234]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as sw

In [235]:
len(sw) # so here in the spacy total 326 stop words are listed

326

In [236]:
x = 'This is an example text data for counting the stop words'
print([word for word in x.lower().split() if word in sw]) # these are the stopwords
len([word for word in x.lower().split() if word in sw])

['this', 'is', 'an', 'for', 'the']


5

In [237]:
df['stop_words_len'] = df['tweets'].apply(lambda x: len([word for word in x.lower().split() if word in sw]))
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len
791,Raining in Chicago=Sad day for the Magic.... T...,0,70,14,5.0,7
3122,is ready for some pampering,1,23,5,4.6,3
2774,@curtiswalker hahahah ok. but I am going to sl...,1,79,18,4.39,7
407,@AmyElizabeth26 I don't care where but I haven...,0,106,24,4.42,14
3094,@taracasper what are you doing here? I thought...,1,62,14,4.43,9


## Count #Hash tags and @mentions

In [238]:
df['hashtag_count'] = df['tweets'].apply(lambda x: len(re.findall(r'#\w+', x)))
df['mentions_count'] = df['tweets'].apply(lambda x: len(re.findall(r'@\w+', x)))
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count
1833,i want to go to the show tonight next weekend...,0,61,14,4.36,8,0,0
2034,thanks Louis for buying me my Mag &lt;3 x,1,33,9,3.67,3,0,0
3646,catie and kaila r comin over!,1,24,6,4.0,1,0,0
2636,@AshleyBuchweitz - There's only one charger le...,1,100,18,5.56,5,0,1
2954,@michaelsheen http://twitpic.com/4f2t1 - ooh s...,1,47,6,7.83,1,0,1


## Numeric digits in tweets

In [239]:
x = 'I want coupon code for the product ABC2345RT and XYZ43256YT. i need 20 pcs of all the items.'
re.findall(r'\b\d+\b', x)  # this is tells you to find the only numeric digits

['20']

In [240]:
df['numeric_counts'] = df['tweets'].apply(lambda x: len(re.findall(r'\b\d+\b', x)))
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts
240,"@itschristablack i know it would be awesome, i...",0,67,15,4.47,7,0,1,0
3185,@danieldraper looks like something I could was...,1,54,10,5.4,5,0,1,0
1118,ohh and missing all my brownheads,0,28,6,4.67,3,0,0,0
173,@likewhoaxox awwz i hate when that happens!! ...,0,84,18,4.67,6,0,1,0
2941,accidentally found my first ever real job payc...,1,60,9,6.67,3,0,0,0


In [241]:
df[df['numeric_counts']>0].head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts
13,Padres come back from being down 6-0 &amp; we ...,0,82,18,4.56,8,0,0,2
16,@BrianQuest I made 1 fo u 2: http://bit.ly/eId...,0,81,19,4.26,6,0,1,2
22,Back niggly again today (boo) so couldn't trai...,0,81,18,4.5,6,0,0,1
25,Eeeeep! New Moon is only 172 days away... Actu...,0,54,12,4.5,4,0,0,1
53,@Gen215 ROFL following Jesus! Found 1 th othr ...,0,109,29,3.76,11,0,1,3


## Count the UPPER case

In [242]:
x = 'HERE I am writing the code for CHECKING THE UPPER case words are present in the tweets data or not'

[word for word in x.split() if word.isupper()], len([word for word in x.split() if word.isupper()])

(['HERE', 'I', 'CHECKING', 'THE', 'UPPER'], 5)

In [243]:
df['upper_counts'] = df['tweets'].apply(lambda x: len([word for word in x.split() if word.isupper()]))
df.sample(4)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts
3060,"@miCh3LL3Ramir3z heh, those things don't usual...",1,72,14,5.14,4,0,1,0,1
3874,"@heidimontag It was wonderful, just home from ...",1,69,15,4.6,9,0,1,0,1
3102,"@jswching Well, perhaps because there aren't t...",1,111,21,5.29,11,0,1,2,0
3257,@trvsbrkr http://twitpic.com/6bft5 - ur much b...,1,78,17,4.59,5,0,1,0,0


In [244]:
df[df['upper_counts']>3].iloc[0]['tweets']

"@jsong77  NOT DONE IT'S ALMOST 2 AM  this youtbe better start acting normaal"

# Preprocessing and Cleaning

## Lower Case Conversion

In [245]:
df['tweets'] = df['tweets'].apply(lambda x: re.sub(r'\s+', ' ', x))

In [246]:
df['tweets'] = df['tweets'].apply(lambda x: x.lower())

In [247]:
df.head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts
0,is bored and wants to watch a movie any sugges...,0,43,10,4.3,5,0,0,0,0
1,back in miami. waiting to unboard ship,0,32,7,4.57,3,0,0,0,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,54,12,4.5,1,0,1,0,1
3,ughhh i am so tired blahhhhhhhhh,0,27,6,4.5,3,0,0,0,0
4,@mandagoforth me bad! it's funny though. zacha...,0,116,26,4.46,13,0,1,0,0


## Contraction to Expansion

In [248]:
import os
os.makedirs('data', exist_ok=True)

In [249]:
import json
contraction = json.load(open('data/contractions.json'))
# contraction

In [250]:
x = "don't mess with me. i'am done with this stuff."
x.split()

["don't", 'mess', 'with', 'me.', "i'am", 'done', 'with', 'this', 'stuff.']

In [251]:
[contraction.get(word.lower(), word) for word in x.split()]

['do not', 'mess', 'with', 'me.', "i'am", 'done', 'with', 'this', 'stuff.']

In [252]:
" ".join([contraction.get(word.lower(), word) for word in x.split()])

"do not mess with me. i'am done with this stuff."

In [253]:
df['tweets'] = df['tweets'].apply(lambda x: " ".join([contraction.get(word.lower(), word) for word in x.split()]))

In [254]:
df.head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts
0,is bored and wants to watch a movie any sugges...,0,43,10,4.3,5,0,0,0,0
1,back in miami. waiting to unboard ship,0,32,7,4.57,3,0,0,0,0
2,@misskpey awwww dnt this bring back at keyboar...,0,54,12,4.5,1,0,1,0,1
3,ughhh i am so tired blahhhhhhhhh,0,27,6,4.5,3,0,0,0,0
4,@mandagoforth me bad! it is funny though. zach...,0,116,26,4.46,13,0,1,0,0


## Count and Remove Emails

In [255]:
x = 'contact me at udemy@kgptalie.com and info123IMP@kgptalkie.co.in and also find me on the 34_clg@gmail.ac.in'

pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z.-]+\.[A-Z|a-z]{2,}\b'
result = re.findall(pattern=pattern, string=x)
print(result)

['udemy@kgptalie.com', 'info123IMP@kgptalkie.co.in', '34_clg@gmail.ac.in']


In [256]:
df['emails'] = df['tweets'].apply(lambda x: ",".join(re.findall(pattern=pattern, string=x)))

In [257]:
df['emails'].value_counts()

emails
                               3999
markbradbury_16@hotmail.com       1
Name: count, dtype: int64

In [258]:
df['email_count'] = df['emails'].apply(lambda x: x.count(',')+1 if len(x)>0 else 0)

In [259]:
df['email_count'].value_counts()

email_count
0    3999
1       1
Name: count, dtype: int64

In [260]:
df['tweets'] = df['tweets'].apply(lambda x: re.sub(pattern, '', x))
df.head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts,emails,email_count
0,is bored and wants to watch a movie any sugges...,0,43,10,4.3,5,0,0,0,0,,0
1,back in miami. waiting to unboard ship,0,32,7,4.57,3,0,0,0,0,,0
2,@misskpey awwww dnt this bring back at keyboar...,0,54,12,4.5,1,0,1,0,1,,0
3,ughhh i am so tired blahhhhhhhhh,0,27,6,4.5,3,0,0,0,0,,0
4,@mandagoforth me bad! it is funny though. zach...,0,116,26,4.46,13,0,1,0,0,,0


In [261]:
df['tweets']

0       is bored and wants to watch a movie any sugges...
1                  back in miami. waiting to unboard ship
2       @misskpey awwww dnt this bring back at keyboar...
3                        ughhh i am so tired blahhhhhhhhh
4       @mandagoforth me bad! it is funny though. zach...
                              ...                        
3995                                     i just graduated
3996              templating works; it all has to be done
3997                      mommy just brought me starbucks
3998    @omarepps watching you on a house re-run...lov...
3999    thanks for trying to make me smile i will make...
Name: tweets, Length: 4000, dtype: object

## Count and remove URLs

In [262]:
x = 'check this link: https://udityanarayantiwari.netlify.app and www.codefusioninhindi.com and also https://github.com/udityamerit'
pattern = r'http\S+|www\.\S+'
re.findall(pattern, x)

['https://udityanarayantiwari.netlify.app',
 'www.codefusioninhindi.com',
 'https://github.com/udityamerit']

In [263]:
df['urls'] = df['tweets'].apply(lambda x: re.findall(pattern, x))

In [264]:
df['urls_count'] = df['urls'].apply(lambda x: len(x))

In [265]:
df[df['urls_count']>0].sample(5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts,emails,email_count,urls,urls_count
233,gays are not supposed to be fat. lol! http://t...,0,54,8,6.75,2,0,0,0,1,,0,[http://twitpic.com/6uvgi],1
2039,i think i found a de-stresser. pop the bubblew...,1,101,16,6.31,7,0,0,0,2,,0,[http://tinyurl.com/gpzf],1
2624,larping about http://lecturesovercoffee.blogsp...,1,73,3,24.33,1,0,0,2,0,,0,[http://lecturesovercoffee.blogspot.com/2009/0...,1
2719,http://twitpic.com/5eynj - office office. true...,1,50,6,8.33,0,0,0,0,0,,0,[http://twitpic.com/5eynj],1
2707,learn how to 'effectively promote your shop' u...,1,113,17,6.65,5,3,0,0,0,,0,[http://bit.ly/brmky],1


In [266]:
df['tweets'] = df['tweets'].apply(lambda x: re.sub(pattern, '', x))
df[df['urls_count']>0].head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts,emails,email_count,urls,urls_count
16,@brianquest i made 1 fo you 2: i tried but it...,0,81,19,4.26,6,0,1,2,3,,0,[http://bit.ly/eid8a],1
98,heading to work,0,37,4,9.25,1,0,0,0,0,,0,[http://twitpic.com/4eojz],1
99,@blondeblogger - i am so sad this is so blurry!,0,62,10,6.2,4,0,1,0,0,,0,[http://twitpic.com/4w8hk],1
144,i miss you â«,0,32,5,6.4,2,0,0,0,1,,0,[http://blip.fm/~8lc2f],1
183,photo: miss germany,0,44,4,11.0,0,0,0,0,0,,0,[http://tumblr.com/xf825f012],1


## Remove RT(retweet)

In [267]:
pattern = r'\bRT @\w+' ## pattern for the RT

In [268]:
df['is_retweet'] = df['tweets'].apply(lambda x: bool(len(re.findall(pattern, x))))

In [269]:
df['is_retweet'].sum() ## no retweet data in our dataset

0

In [270]:
## let's assume if any retweet data is present then how to remove it

df['tweets'] = df['tweets'].apply(lambda x: re.sub(pattern, '', x))

## Remove HTML tags

In [271]:
x = '<meta property="og:title" content="How to Become a successful machine learning engineer - KGP Talkie" />'

In [272]:
from bs4 import BeautifulSoup

In [273]:
# %pip install lxml

In [274]:
soup = BeautifulSoup(x, 'lxml')
soup

<html><head><meta content="How to Become a successful machine learning engineer - KGP Talkie" property="og:title"/></head></html>

In [275]:
soup.find('meta')['content']

'How to Become a successful machine learning engineer - KGP Talkie'

In [276]:
df['tweets']=df['tweets'].apply(lambda x: BeautifulSoup('<p>'+x+'<p>', 'lxml').get_text())

## Remove Accented Characters

In [277]:
import unicodedata

x = 'à, è, ì, ò, ù, À, È, Ì, Ò, Ù'

unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')

'a, e, i, o, u, A, E, I, O, U'

In [278]:
df['tweets']=df['tweets'].apply(lambda x:unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore') )

## Special Chars removal or punctuation removal

In [279]:
x = '@mandagoforth me bad! it is funny $though. #Zach'

pattern = r'@\w+'
re.findall(pattern, x)
x = re.sub(pattern, "", x).strip()
x

'me bad! it is funny $though. #Zach'

In [280]:
# [^A-Za-z0-9-] == [^\w\s]  it means detect all other than the words
pattern = r'[^\w\s]'
re.sub(pattern,'',x)


'me bad it is funny though Zach'

In [281]:
df['tweets'] = df['tweets'].apply(lambda x: re.sub(pattern, '', x).strip())

In [282]:
df.head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts,emails,email_count,urls,urls_count,is_retweet
0,is bored and wants to watch a movie any sugges...,0,43,10,4.3,5,0,0,0,0,,0,[],0,False
1,back in miami waiting to unboard ship,0,32,7,4.57,3,0,0,0,0,,0,[],0,False
2,misskpey awwww dnt this bring back at keyboard...,0,54,12,4.5,1,0,1,0,1,,0,[],0,False
3,ughhh i am so tired blahhhhhhhhh,0,27,6,4.5,3,0,0,0,0,,0,[],0,False
4,mandagoforth me bad it is funny though zachary...,0,116,26,4.46,13,0,1,0,0,,0,[],0,False


## Remove the Repeated Chars

In [283]:
x = 'i loooovvvvvveeeeee learninggggggg'
pattern = r'(.)\1+'
re.sub(pattern, r'\1\1', x)

'i loovvee learningg'

In [284]:
df['tweets'] = df['tweets'].apply(lambda x: re.sub(pattern, r'\1\1', x))
df.head()

## after that we are correcting the sentence using the spell correcting algo

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts,emails,email_count,urls,urls_count,is_retweet
0,is bored and wants to watch a movie any sugges...,0,43,10,4.3,5,0,0,0,0,,0,[],0,False
1,back in miami waiting to unboard ship,0,32,7,4.57,3,0,0,0,0,,0,[],0,False
2,misskpey aww dnt this bring back at keyboard m...,0,54,12,4.5,1,0,1,0,1,,0,[],0,False
3,ughh i am so tired blahh,0,27,6,4.5,3,0,0,0,0,,0,[],0,False
4,mandagoforth me bad it is funny though zachary...,0,116,26,4.46,13,0,1,0,0,,0,[],0,False


## Removal of Stop Words

In [285]:
nlp = spacy.load('en_core_web_sm')
stop_words = nlp.Defaults.stop_words
x = 'this is a sample sentence with the stop words. this that and so many things..'
" ".join([word for word in x.split() if word not in stop_words])

'sample sentence stop words. things..'

In [286]:
df['tweets_no_stop'] = df['tweets'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [287]:
df.head(2)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts,emails,email_count,urls,urls_count,is_retweet,tweets_no_stop
0,is bored and wants to watch a movie any sugges...,0,43,10,4.3,5,0,0,0,0,,0,[],0,False,bored wants watch movie suggestions
1,back in miami waiting to unboard ship,0,32,7,4.57,3,0,0,0,0,,0,[],0,False,miami waiting unboard ship


## Convert into base or root form of word

In [288]:
import spacy
nlp = spacy.load('en_core_web_sm')
x = 'The dogs and cats are playing. Children are going to school'
doc = nlp(x)

for token in doc:
        if token.pos_ in ['NOUN', 'VERB']:
                print(token.pos_, token.lemma_)

NOUN dog
NOUN cat
VERB play
NOUN child
VERB go
NOUN school


In [289]:
x = 'The dogs and cats are playing. Children are going to school'


def lemmatize_noun_verb(x):
        doc = nlp(x)
        tokens = []
        for token in doc:
                if token.pos_ in ['NOUN', 'VERB']:
                        tokens.append(token.lemma_)
                else:
                        tokens.append(token.text)
        x = ' '.join(tokens)
        pattern = r'\s\.'
        x = re.sub(pattern, '.', x)
        return x

lemmatize_noun_verb(x)

'The dog and cat are play. child are go to school'

In [290]:
# df['tweets'].apply(lambda x: lemmatize_noun_verb(x))
df['base_tweets'] = df['tweets'].apply(lemmatize_noun_verb)

# both are the same 

In [291]:
df[df['base_tweets'] != df['tweets']].iloc[1]

tweets            back in miami waiting to unboard ship
sentiment                                             0
char_counts                                          32
word_counts                                           7
avg_word_len                                       4.57
stop_words_len                                        3
hashtag_count                                         0
mentions_count                                        0
numeric_counts                                        0
upper_counts                                          0
emails                                                 
email_count                                           0
urls                                                 []
urls_count                                            0
is_retweet                                        False
tweets_no_stop               miami waiting unboard ship
base_tweets          back in miami wait to unboard ship
Name: 1, dtype: object

## Common words removal

In [292]:
x = ' '.join(df['tweets_no_stop'].tolist())
words = x.split()

In [296]:
# words

In [300]:
from collections import Counter
word_freq = Counter(words)
top10 = [word[0] for word in word_freq.most_common(10)]
print(top10)

['day', 'good', 'today', 'like', 'love', 'got', 'work', 'going', 'time', 'know']


In [302]:
df['no_common_tweets'] = df['tweets'].apply(lambda x: ' '.join([word for word in x.split() if word not in top10]))

In [303]:
df.head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts,emails,email_count,urls,urls_count,is_retweet,tweets_no_stop,base_tweets,no_common_tweets
0,is bored and wants to watch a movie any sugges...,0,43,10,4.3,5,0,0,0,0,,0,[],0,False,bored wants watch movie suggestions,is bored and want to watch a movie any suggestion,is bored and wants to watch a movie any sugges...
1,back in miami waiting to unboard ship,0,32,7,4.57,3,0,0,0,0,,0,[],0,False,miami waiting unboard ship,back in miami wait to unboard ship,back in miami waiting to unboard ship
2,misskpey aww dnt this bring back at keyboard m...,0,54,12,4.5,1,0,1,0,1,,0,[],0,False,misskpey aww dnt bring keyboard memoriess thnk...,misskpey aww dnt this bring back at keyboard m...,misskpey aww dnt this bring back at keyboard m...
3,ughh i am so tired blahh,0,27,6,4.5,3,0,0,0,0,,0,[],0,False,ughh tired blahh,ughh i am so tired blahh,ughh i am so tired blahh
4,mandagoforth me bad it is funny though zachary...,0,116,26,4.46,13,0,1,0,0,,0,[],0,False,mandagoforth bad funny zachary quinto reply sy...,mandagoforth me bad it is funny though zachary...,mandagoforth me bad it is funny though zachary...


## Rare words removal

In [304]:
least10 = [word[0] for word in word_freq.most_common()[-11:]]
print(least10)

['imjstsayin', 'littlefletcher', 'imstardust', 'heyhey', 'colin', 'saravananr', 'heycameron', 'thejetset', 'templating', 'omarepps', 'rerunlovin']


In [305]:
df['no_least_tweets'] = df['tweets'].apply(lambda x: ' '.join([word for word in x.split() if word not in least10]))

In [306]:
df.head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts,emails,email_count,urls,urls_count,is_retweet,tweets_no_stop,base_tweets,no_common_tweets,no_least_tweets
0,is bored and wants to watch a movie any sugges...,0,43,10,4.3,5,0,0,0,0,,0,[],0,False,bored wants watch movie suggestions,is bored and want to watch a movie any suggestion,is bored and wants to watch a movie any sugges...,is bored and wants to watch a movie any sugges...
1,back in miami waiting to unboard ship,0,32,7,4.57,3,0,0,0,0,,0,[],0,False,miami waiting unboard ship,back in miami wait to unboard ship,back in miami waiting to unboard ship,back in miami waiting to unboard ship
2,misskpey aww dnt this bring back at keyboard m...,0,54,12,4.5,1,0,1,0,1,,0,[],0,False,misskpey aww dnt bring keyboard memoriess thnk...,misskpey aww dnt this bring back at keyboard m...,misskpey aww dnt this bring back at keyboard m...,misskpey aww dnt this bring back at keyboard m...
3,ughh i am so tired blahh,0,27,6,4.5,3,0,0,0,0,,0,[],0,False,ughh tired blahh,ughh i am so tired blahh,ughh i am so tired blahh,ughh i am so tired blahh
4,mandagoforth me bad it is funny though zachary...,0,116,26,4.46,13,0,1,0,0,,0,[],0,False,mandagoforth bad funny zachary quinto reply sy...,mandagoforth me bad it is funny though zachary...,mandagoforth me bad it is funny though zachary...,mandagoforth me bad it is funny though zachary...


## Word Cloud Visualization