# General Feature Extraction

## Data Loader

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Raw-CSV download link for the 4,000-tweet sentiment sample used in this notebook.
url = (
    'https://github.com/laxmimerit/All-CSV-ML-Data-Files-Download'
    '/raw/master/twitter4000.csv'
)
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Preview the first rows to confirm the file parsed into tweet text plus a sentiment label.
df.head()

Unnamed: 0,tweets,sentiment
0,is bored and wants to watch a movie any sugge...,0
1,back in miami. waiting to unboard ship,0
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0
3,ughhh i am so tired blahhhhhhhhh,0
4,@mandagoforth me bad! It's funny though. Zacha...,0


In [4]:
# Check the row count, column dtypes and non-null counts before deriving features.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweets     4000 non-null   object
 1   sentiment  4000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 62.6+ KB


In [5]:
# Missing values per column; the feature cells below call str methods on every
# tweet, so a null entry would make them raise.
df.isnull().sum()

tweets       0
sentiment    0
dtype: int64

In [6]:
# Class balance of the sentiment label.
df['sentiment'].value_counts()

sentiment
0    2000
1    2000
Name: count, dtype: int64

## Characters Count

In [7]:
text = 'this is a   simple text'
# Character count ignoring spaces: total length minus the number of ' ' characters.
len(text) - text.count(' ')

17

In [8]:
# Regex alternative: \s matches any whitespace character (space, tab, newline),
# not just the literal ' ' handled by str.replace.
pattern = r'\s'
re.compile(pattern).sub('', text)


'thisisasimpletext'

In [9]:
# Count the non-whitespace characters in each tweet.
# The whitespace regex is written inline instead of reusing the `pattern`
# variable from the demo cell, so this feature does not silently change if that
# scratch variable is reassigned or the demo cell is skipped.
df['char_counts'] = df['tweets'].apply(lambda tweet: len(re.sub(r'\s', '', tweet)))
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts
377,@not_mikecoleman I am sorry I have only just ...,0,45
3146,School again tomorrow! Taking summer classes s...,1,94
2773,"@tehkseven @Cali_Breezy, Yea. I can confirm th...",1,97
2741,cleaning! getting ready for the fam to come in...,1,103
2203,@hourrafoot =&gt; estelle who?,1,27


## Word Counts

In [10]:
# Words per tweet: split on runs of whitespace, then take the length of each token list.
df['word_counts'] = df['tweets'].apply(str.split).apply(len)
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts
3520,Haters do your job you got me this far!!!! Tha...,1,66,15
1346,I'm watching Jon &amp; Kate and they're breaki...,0,48,10
3841,@ddlovato you should totally wish @takemeback ...,1,64,12
1141,@edythemighty sorry your dog is sick,0,31,6
552,Crying literally why did you run under my car...,0,66,16


## Average Word Length

In [11]:
# Mean characters per word (non-whitespace characters / whitespace-separated
# words), rounded to two decimals.
df['avg_word_len'] = (df['char_counts'] / df['word_counts']).apply(
    lambda ratio: round(ratio, 2)
)

df.head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len
0,is bored and wants to watch a movie any sugge...,0,43,10,4.3
1,back in miami. waiting to unboard ship,0,32,7,4.57
2,"@misskpey awwww dnt dis brng bak memoriessss, ...",0,54,12,4.5
3,ughhh i am so tired blahhhhhhhhh,0,27,6,4.5
4,@mandagoforth me bad! It's funny though. Zacha...,0,116,26,4.46


## Stop Words Count

In [12]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as sw

In [None]:
len(sw) # size of spaCy's English stop-word set; the recorded output for this cell is 326

326

In [None]:
x = 'This is an example text data for counting the stop words'
# Lower-case before the lookup so that 'This' is matched as 'this'; build the
# list once and reuse it for both the printout and the count.
stop_tokens = [token for token in x.lower().split() if token in sw]
print(stop_tokens)
len(stop_tokens)

['this', 'is', 'an', 'for', 'the']


5

In [17]:
# Stop words per tweet: lower-case, split on whitespace, and tally the tokens
# that appear in the stop-word set `sw`.
df['stop_words_len'] = df['tweets'].apply(
    lambda tweet: sum(1 for token in tweet.lower().split() if token in sw)
)
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len
3101,@Paulpb I will always remember you are &quot;T...,1,116,28,4.14,15
2205,eating food leaving school to go to hospital ...,1,45,12,3.75,5
3894,@MerlinsKingdom Immortal: This should be good...,1,67,15,4.47,8
1566,@pastapadre that was my biggest gripe with the...,0,75,14,5.36,7
2731,"Well, nice of you to finally show up, Sexy McB...",1,43,10,4.3,4


## Count #hashtags and @mentions

In [18]:
# Count hashtags and @mentions per tweet.
# The negative look-behinds require the '#' / '@' to start a token:
#   * '#' must not follow a word character or '&', so HTML numeric entities
#     such as "&#39;" and mid-word hashes are not counted as hashtags;
#   * '@' must not follow a word character, so e-mail addresses such as
#     "name@example.com" are not counted as mentions.
df['hashtag_count'] = df['tweets'].apply(lambda tweet: len(re.findall(r'(?<![\w&])#\w+', tweet)))
df['mentions_count'] = df['tweets'].apply(lambda tweet: len(re.findall(r'(?<!\w)@\w+', tweet)))
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count
2511,read! read!! read!!!,1,18,3,6.0,0,0,0
212,MONDAY + FIRST OF THE MONTH = LONGGGG DAYYYYY.,0,38,9,4.22,3,0,0
2299,@eurini Ahh good. You'll like it,1,27,6,4.5,1,0,1
2577,#3hotwords - @britneyspears' &quot;phonography...,1,49,4,12.25,0,1,1
1990,I FEEL SO..... REPLACED.,0,21,4,5.25,1,0,0


## Numeric digits in tweets

In [None]:
x = 'I want coupon code for the product ABC2345RT and XYZ43256YT. i need 20 pcs of all the items.'
# \b on both sides keeps only digit runs that form a whole token, so the digits
# embedded in the codes ABC2345RT and XYZ43256YT are not returned.
[match.group() for match in re.finditer(r'\b\d+\b', x)]

['20']

In [22]:
# Stand-alone numbers per tweet: tally every match of a digit run bounded by
# word boundaries.
df['numeric_counts'] = df['tweets'].apply(
    lambda tweet: sum(1 for _ in re.finditer(r'\b\d+\b', tweet))
)
df.sample(5)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts
1138,"parents still get Bmore Sun delivered, saw it ...",0,85,21,4.05,9,0,0,0
1726,SO depressed cause our dorm is showing MILK We...,0,77,19,4.05,10,0,0,0
2772,@kewiki @musiccityace Think we should open &qu...,1,123,25,4.92,12,0,2,0
3113,@ArmyWifeyDebbie Army folk travel so much that...,1,120,25,4.8,14,0,1,0
3261,@sazmows started the new job yet? Hope it rock...,1,99,19,5.21,7,0,1,0


In [27]:
# Show the first tweets that contain at least one stand-alone number.
df.loc[df['numeric_counts'] > 0].head()

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts
13,Padres come back from being down 6-0 &amp; we ...,0,82,18,4.56,8,0,0,2
16,@BrianQuest I made 1 fo u 2: http://bit.ly/eId...,0,81,19,4.26,6,0,1,2
22,Back niggly again today (boo) so couldn't trai...,0,81,18,4.5,6,0,0,1
25,Eeeeep! New Moon is only 172 days away... Actu...,0,54,12,4.5,4,0,0,1
53,@Gen215 ROFL following Jesus! Found 1 th othr ...,0,109,29,3.76,11,0,1,3


## Count UPPERCASE words

In [28]:
x = 'HERE I am writing the code for CHECKING THE UPPER case words are present in the tweets data or not'

# Collect the all-caps tokens once, then show them together with their count.
upper_words = [token for token in x.split() if token.isupper()]
upper_words, len(upper_words)

(['HERE', 'I', 'CHECKING', 'THE', 'UPPER'], 5)

In [33]:
# All-caps tokens per tweet: str.isupper() is evaluated on every
# whitespace-separated token and the True results are summed.
df['upper_counts'] = df['tweets'].apply(
    lambda tweet: sum(map(str.isupper, tweet.split()))
)
df.sample(4)

Unnamed: 0,tweets,sentiment,char_counts,word_counts,avg_word_len,stop_words_len,hashtag_count,mentions_count,numeric_counts,upper_counts
1410,I think I might be getting sick.,0,26,7,3.71,4,0,0,0,2
2933,@vene2ia Thanks! I am doing good... Did you r...,1,57,14,4.07,8,0,1,0,1
523,misses your jokes. http://plurk.com/p/12e40f,0,41,4,10.25,1,0,0,0,0
2784,@realadulttalk come on and smile for me? That'...,1,62,12,5.17,5,0,1,0,1


In [35]:
# Text of the first tweet that has more than three all-caps tokens.
df.loc[df['upper_counts'] > 3, 'tweets'].iloc[0]

"@jsong77  NOT DONE IT'S ALMOST 2 AM  this youtbe better start acting normaal"

# Preprocessing and Cleaning

## Lower Case Conversion