# General Feature Extraction

## Data Loader

In [297]:
import pandas as pd
import numpy as np
import re

In [298]:
# Remote CSV of labelled tweets; pandas downloads it each time this cell runs.
url = 'https://github.com/laxmimerit/All-CSV-ML-Data-Files-Download/raw/master/twitter4000.csv'
df = pd.read_csv(url)

In [299]:
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,tweets,sentiment
0,is bored and wants to watch a movie any sugge...,0
1,back in miami. waiting to unboard ship,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0
3,ughhh i am so tired blahhhhhhhhh,0
4,@mandagoforth me bad! It's funny though. Zacha...,0


In [300]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweets     4000 non-null   object
 1   sentiment  4000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 62.6+ KB


In [301]:
# Count missing values per column.
df.isnull().sum()

tweets       0
sentiment    0
dtype: int64

In [302]:
# Check how many tweets fall into each sentiment class.
df['sentiment'].value_counts()

sentiment
0    2000
1    2000
Name: count, dtype: int64

## Characters Count

In [303]:
# Toy sentence with irregular spacing for the character-count demo.
text = 'this is a   simple text'
# Tally every character that is not a literal space.
sum(1 for ch in text if ch != ' ')

17

In [304]:
# Same idea with a regex, which strips every whitespace character rather than only ' '.
pattern = r'\s' # \s matches a single whitespace character (space, tab, newline, ...)
re.sub(pattern, '', text)


'thisisasimpletext'

In [305]:
# Count the non-whitespace characters in each tweet.
#
# The regex is written inline instead of being read from the shared `pattern`
# variable: later cells rebind `pattern` to the e-mail, URL and RT regexes, so
# depending on it would silently change these counts if this cell were re-run
# after those cells.
df['char_counts'] = df['tweets'].apply(lambda x: len(re.sub(r'\s', '', x)))
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts
3884,up for an early run on this beautiful saturday...,1,109
1247,processing claims.....and doing monthly report...,0,63
1245,is at church waiting for friends to show up .....,0,53
3184,"@RogueOne Why, thank you",1,21
1303,Trying this hair thing. If it doesnt work... w...,0,46


## Word Counts

In [306]:
# Number of whitespace-separated tokens in each tweet.
df['word_counts'] = df['tweets'].map(lambda tweet: len(tweet.split()))
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts
2083,The sun is shining and it's very hot today,1,34,9
1045,@Brieannnn i wish you were on againnn i was c...,0,43,10
1738,Oh my... No friends here guess i'll hit the ...,0,53,12
60,@TwilightofDoom NICE!!!!! I havent seen that m...,0,54,9
3743,Pip-pip-doodly-do laddy,1,22,2


## Average Word Length

In [307]:
# Average token length: non-whitespace characters divided by token count,
# rounded to two decimals with Python's built-in round().
df['avg_word_len'] = (
    df['char_counts']
    .div(df['word_counts'])
    .map(lambda ratio: round(ratio, 2))
)

df.head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len
0,is bored and wants to watch a movie any sugge...,0,43,10,4.3
1,back in miami. waiting to unboard ship,0,32,7,4.57
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,54,12,4.5
3,ughhh i am so tired blahhhhhhhhh,0,27,6,4.5
4,@mandagoforth me bad! It's funny though. Zacha...,0,116,26,4.46


## Stop Words Count

In [308]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as sw

In [309]:
len(sw) # size of spaCy's English stop-word set (326 in the version used for this run)

326

In [310]:
x = 'This is an example text data for counting the stop words'
# Show which lower-cased tokens are stop words, then how many there are.
print(list(filter(sw.__contains__, x.lower().split())))
sum(word in sw for word in x.lower().split())

['this', 'is', 'an', 'for', 'the']


5

In [311]:
# Per-tweet count of lower-cased, whitespace-separated tokens found in the stop-word set.
df['stop_words_len'] = df['tweets'].map(
    lambda x: sum(word in sw for word in x.lower().split())
)
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len
3655,"Throat hurts, getting some ice cream! Movie ni...",1,59,12,4.92,3
2234,Sometimes those things are best especially wit...,1,55,9,6.11,4
2495,Anybody want to chat it up? Hit me up on aim a...,1,64,23,2.78,10
1764,My Holidays are so monotonous widout my gang !...,0,61,15,4.07,6
491,RSVP only for the virgin megastore open bar. T...,0,54,11,4.91,4


## Count #hashtags and @mentions

In [312]:
# Count hashtags and @-mentions per tweet.
#
# The negative look-behinds require the marker to start a token:
#   * (?<!\w)@     -> the "@host" part of an e-mail address (user@host.com)
#                     is no longer counted as a mention;
#   * (?<![\w&])#  -> a '#' glued to a word, or one that is part of a numeric
#                     HTML entity such as &#39;, is not counted as a hashtag.
df['hashtag_count'] = df['tweets'].apply(lambda x: len(re.findall(r'(?<![\w&])#\w+', x)))
df['mentions_count'] = df['tweets'].apply(lambda x: len(re.findall(r'(?<!\w)@\w+', x)))
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count
1097,Too tired to smokeee,0,17,4,4.25,2,0,0
1526,"@Katestar Oh yes, extremely fun when whoever y...",0,91,20,4.55,7,0,1
2822,Posting a tweet through sonnys new palm pre,1,36,8,4.5,2,0,0
186,catching up on some tv. i miss dvr,0,27,8,3.38,4,0,0
1598,is sad to be going back to Utah tomorrow,0,32,9,3.56,5,0,0


## Numeric digits in tweets

In [313]:
x = 'I want coupon code for the product ABC2345RT and XYZ43256YT. i need 20 pcs of all the items.'
re.findall(r'\b\d+\b', x)  # \b...\b keeps only standalone digit runs; digits inside codes like ABC2345RT are skipped

['20']

In [314]:
# Per-tweet count of standalone digit runs (word boundaries on both sides).
df['numeric_counts'] = df['tweets'].map(
    lambda x: sum(1 for _ in re.finditer(r'\b\d+\b', x))
)
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts
77,@CatsHateMangos Ah rubbish. I leave for his on...,0,43,9,4.78,4,0,1,0
2915,spending the day at the pool,1,23,6,3.83,3,0,0,0
586,"@dawnmay7 Amen, I love house. All of my fav s...",0,67,15,4.47,5,0,1,0
2713,A perfect sunday for MindEmptyness,1,30,5,6.0,2,0,0,0
2331,are getting ready to go to church,1,27,7,3.86,4,0,0,0


In [315]:
# First few tweets that contain at least one standalone number.
df.loc[df['numeric_counts'].gt(0)].head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts
13,Padres come back from being down 6-0 &amp; we ...,0,82,18,4.56,8,0,0,2
16,@BrianQuest I made 1 fo u 2: http://bit.ly/eId...,0,81,19,4.26,6,0,1,2
22,Back niggly again today (boo) so couldn't trai...,0,81,18,4.5,6,0,0,1
25,Eeeeep! New Moon is only 172 days away... Actu...,0,54,12,4.5,4,0,0,1
53,@Gen215 ROFL following Jesus! Found 1 th othr ...,0,109,29,3.76,11,0,1,3


## Count the UPPER case

In [316]:
x = 'HERE I am writing the code for CHECKING THE UPPER case words are present in the tweets data or not'

# Pair of (fully upper-case tokens, how many of them there are).
list(filter(str.isupper, x.split())), sum(map(str.isupper, x.split()))

(['HERE', 'I', 'CHECKING', 'THE', 'UPPER'], 5)

In [317]:
# Per-tweet count of whitespace-separated tokens for which str.isupper() is true.
df['upper_counts'] = df['tweets'].map(lambda x: sum(map(str.isupper, x.split())))
df.sample(4)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts
2772,@kewiki @musiccityace Think we should open &qu...,1,123,25,4.92,12,0,2,0,0
2583,eating watermelon.,1,17,2,8.5,0,0,0,0,0
763,http://twitpic.com/7qkel - they look like they...,0,76,13,5.85,4,0,0,0,0
140,"@_tranquilize ill keep you up, i have to stay ...",0,55,14,3.93,8,0,1,0,0


In [318]:
# Text of the first tweet that has more than three upper-case tokens.
df.loc[df['upper_counts'] > 3, 'tweets'].iloc[0]

"@jsong77  NOT DONE IT'S ALMOST 2 AM  this youtbe better start acting normaal"

# Preprocessing and Cleaning

## Whitespace Normalization (tweets are not lower-cased here)

In [319]:
# Collapse every run of whitespace inside a tweet into a single space.
df['tweets'] = df['tweets'].map(lambda tweet: re.sub(r'\s+', ' ', tweet))

In [320]:
# Re-inspect the first rows after the tweets column was rewritten.
df.head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts
0,is bored and wants to watch a movie any sugges...,0,43,10,4.3,5,0,0,0,0
1,back in miami. waiting to unboard ship,0,32,7,4.57,3,0,0,0,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,54,12,4.5,1,0,1,0,1
3,ughhh i am so tired blahhhhhhhhh,0,27,6,4.5,3,0,0,0,0
4,@mandagoforth me bad! It's funny though. Zacha...,0,116,26,4.46,13,0,1,0,0


## Contraction to Expansion

In [321]:
import os
# Ensure the local data/ directory exists; exist_ok=True makes this a no-op when it already does.
os.makedirs('data', exist_ok=True)

In [322]:
import json

# Load the lookup table used below via contraction.get(token, token)
# (token -> replacement text).
# The context manager closes the file handle as soon as parsing finishes, and
# the explicit UTF-8 encoding keeps the read independent of the platform's
# default locale encoding.
with open('data/contractions.json', encoding='utf-8') as fh:
    contraction = json.load(fh)

In [323]:
# Sample sentence; splitting on whitespace leaves punctuation attached to tokens (e.g. 'me.').
x = "don't mess with me. i'am done with this stuff."
x.split()

["don't", 'mess', 'with', 'me.', "i'am", 'done', 'with', 'this', 'stuff.']

In [324]:
# Replace each token by its table entry (looked up lower-cased); unknown tokens pass through unchanged.
list(map(lambda word: contraction.get(word.lower(), word), x.split()))

['do not', 'mess', 'with', 'me.', "i'am", 'done', 'with', 'this', 'stuff.']

In [325]:
" ".join([contraction.get(word.lower(), word) for word in x.split()])

"do not mess with me. i'am done with this stuff."

In [326]:
# Expand contractions in every tweet: look up each whitespace-separated token
# (lower-cased) in the table and re-join the results with single spaces.
df['tweets'] = df['tweets'].map(
    lambda x: " ".join(contraction.get(word.lower(), word) for word in x.split())
)

In [327]:
# Re-inspect the first rows after contraction expansion.
df.head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts
0,is bored and wants to watch a movie any sugges...,0,43,10,4.3,5,0,0,0,0
1,back in miami. waiting to unboard ship,0,32,7,4.57,3,0,0,0,0
2,@misskpey awwww dnt this bring back at keyboar...,0,54,12,4.5,1,0,1,0,1
3,ughhh i am so tired blahhhhhhhhh,0,27,6,4.5,3,0,0,0,0
4,@mandagoforth me bad! it is funny though. Zach...,0,116,26,4.46,13,0,1,0,0


## Count and Remove Emails

In [328]:
x = 'contact me at udemy@kgptalie.com and info123IMP@kgptalkie.co.in and also find me on the 34_clg@gmail.ac.in'

# E-mail regex: <local part>@<domain>.<TLD of 2+ letters>.
#   * The TLD class is [A-Za-z]; writing it as [A-Z|a-z] would also accept a
#     literal '|' because '|' is not alternation inside a character class.
#   * The domain class includes digits so hosts like mail123.com are matched.
# `pattern` is reused by the following cells to extract and strip e-mails.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
result = re.findall(pattern, x)
print(result)

['udemy@kgptalie.com', 'info123IMP@kgptalkie.co.in', '34_clg@gmail.ac.in']


In [329]:
# Store every e-mail address found in a tweet as one comma-separated string ('' when none).
df['emails'] = df['tweets'].map(lambda tweet: ",".join(re.findall(pattern, tweet)))

In [330]:
# Distribution of the extracted e-mail strings (the empty string means no address was found).
df['emails'].value_counts()

emails
                               3999
markbradbury_16@hotmail.com       1
Name: count, dtype: int64

In [331]:
# Number of addresses per tweet, recovered from the comma-joined string; '' counts as zero.
df['email_count'] = df['emails'].map(lambda joined: len(joined.split(',')) if joined else 0)

In [332]:
# How many tweets contain 0, 1, ... e-mail addresses.
df['email_count'].value_counts()

email_count
0    3999
1       1
Name: count, dtype: int64

In [333]:
# Strip every e-mail match from the tweet text (`pattern` still holds the e-mail regex here).
df['tweets'] = df['tweets'].map(lambda tweet: re.sub(pattern, '', tweet))
df.head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts,emails,email_count
0,is bored and wants to watch a movie any sugges...,0,43,10,4.3,5,0,0,0,0,,0
1,back in miami. waiting to unboard ship,0,32,7,4.57,3,0,0,0,0,,0
2,@misskpey awwww dnt this bring back at keyboar...,0,54,12,4.5,1,0,1,0,1,,0
3,ughhh i am so tired blahhhhhhhhh,0,27,6,4.5,3,0,0,0,0,,0
4,@mandagoforth me bad! it is funny though. Zach...,0,116,26,4.46,13,0,1,0,0,,0


In [334]:
# Show the tweets column after e-mail removal.
df['tweets']

0       is bored and wants to watch a movie any sugges...
1                  back in miami. waiting to unboard ship
2       @misskpey awwww dnt this bring back at keyboar...
3                        ughhh i am so tired blahhhhhhhhh
4       @mandagoforth me bad! it is funny though. Zach...
                              ...                        
3995                                     i just graduated
3996              Templating works; it all has to be done
3997                      mommy just brought me starbucks
3998    @omarepps watching you on a House re-run...lov...
3999    Thanks for trying to make me smile i will make...
Name: tweets, Length: 4000, dtype: object

## Count and remove URLs

In [335]:
x = 'check this link: https://udityanarayantiwari.netlify.app and www.codefusioninhindi.com and also https://github.com/udityamerit'
# URL regex: an explicit http:// or https:// link, or a bare "www." host.
# Requiring the "://" separator and a leading word boundary stops ordinary
# text from being treated as a link, e.g. the word "httpd" or the tail of an
# elongated "awwww.that" (which contains "www.").
# `pattern` is reused by the following cells to extract and strip URLs.
pattern = r'\bhttps?://\S+|\bwww\.\S+'
re.findall(pattern, x)

['https://udityanarayantiwari.netlify.app',
 'www.codefusioninhindi.com',
 'https://github.com/udityamerit']

In [336]:
# Collect the list of URL matches for each tweet (`pattern` holds the URL regex here).
df['urls'] = df['tweets'].map(lambda tweet: re.findall(pattern, tweet))

In [337]:
# Number of URLs per tweet = length of the extracted list.
df['urls_count'] = df['urls'].map(len)

In [338]:
# Random sample of five tweets that contain at least one URL.
df.loc[df['urls_count'].gt(0)].sample(n=5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts,emails,email_count,urls,urls_count
99,@BlondeBlogger http://twitpic.com/4w8hk - i am...,0,62,10,6.2,4,0,1,0,0,,0,[http://twitpic.com/4w8hk],1
2964,@mileycyrus http://twitpic.com/6shtr - please ...,1,101,14,7.21,5,0,1,0,4,,0,[http://twitpic.com/6shtr],1
2620,#eric in the clear! latest @VeronicaDLCruz twl...,1,95,15,6.33,3,1,2,0,0,,0,[http://www.tweet4eric.com/],1
507,http://twitrpix.com/bfh More testing i am afraid.,0,44,5,8.8,1,0,0,0,0,,0,[http://twitrpix.com/bfh],1
523,misses your jokes. http://plurk.com/p/12e40f,0,41,4,10.25,1,0,0,0,0,,0,[http://plurk.com/p/12e40f],1


In [339]:
# Strip every URL match from the tweet text, then show rows that had URLs to check the result.
df['tweets'] = df['tweets'].map(lambda tweet: re.sub(pattern, '', tweet))
df.loc[df['urls_count'].gt(0)].head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts,emails,email_count,urls,urls_count
16,@BrianQuest I made 1 fo you 2: I tried but it...,0,81,19,4.26,6,0,1,2,3,,0,[http://bit.ly/eId8A],1
98,Heading to work,0,37,4,9.25,1,0,0,0,0,,0,[http://twitpic.com/4eojz],1
99,@BlondeBlogger - i am so sad this is so blurry!,0,62,10,6.2,4,0,1,0,0,,0,[http://twitpic.com/4w8hk],1
144,I miss you â«,0,32,5,6.4,2,0,0,0,1,,0,[http://blip.fm/~8lc2f],1
183,Photo: miss germany,0,44,4,11.0,0,0,0,0,0,,0,[http://tumblr.com/xf825f012],1


## Remove RT (retweet) markers

In [None]:
pattern = r'\bRT @\w+' # retweet marker: "RT" starting at a word boundary, one space, then an @handle

In [341]:
# Flag tweets that contain at least one retweet marker.
df['is_retweet'] = df['tweets'].map(lambda tweet: re.search(pattern, tweet) is not None)

In [None]:
df['is_retweet'].sum() # number of flagged tweets; 0 in this run, i.e. no tweet matched the RT pattern

0

In [343]:
# No retweets were found in this dataset, but this is how the marker would be
# removed if any were present.

df['tweets'] = df['tweets'].map(lambda tweet: re.sub(pattern, '', tweet))

## Remove HTML tags