# Basic Text Features

In [2]:
# sample text string
text = "Dark matter is one of the greatest enigmas of astrophysics and cosmology"

We will split the string into individual words or tokens. This is also known as __tokenization__.

In [6]:
# split words of the text
text.split().ipynb_checkpoints/

['Dark',
 'matter',
 'is',
 'one',
 'of',
 'the',
 'greatest',
 'enigmas',
 'of',
 'astrophysics',
 'and',
 'cosmology']

In [8]:
# store the individual words in a variable
words = text.split()
words

['Dark',
 'matter',
 'is',
 'one',
 'of',
 'the',
 'greatest',
 'enigmas',
 'of',
 'astrophysics',
 'and',
 'cosmology']

### 1. Number of Words

In [9]:
# word count
len(words)

12

### 2. Number of Spaces

In [14]:
# spaces count
text.count(' ')

11

### 3. Number of Characters

In [15]:
# character count
len(text)

72

Even the spaces have been included.

In [16]:
# character count (excluding spaces)
len(text)-text.count(' ')

61

So, the text string has 61 characters excluding spaces.

### 4. Average Word Length

In [17]:
# empty list for
word_lengths = []

for i in text.split():
    word_lengths.append(len(i))
    
print(word_lengths)

[4, 6, 2, 3, 2, 3, 8, 7, 2, 12, 3, 9]


In [18]:
# average word length
sum(word_lengths)/len(word_lengths)

5.083333333333333

---

# Create Features for Twitter Dataset

Let's create the above mentioned features for a real-life dataset. 

In [19]:
import pandas as pd

In [20]:
tweets = pd.read_csv("tweets.csv")

Have a glimpse at the data.

In [21]:
tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


This dataset has 3 features right now. 

1. __id:__ tweet id number, unique for every tweet
2. __label:__ 1 for negative tweet and 0 for positive or neutral tweet
3. __tweet:__ text data

We will create new features from the feature "tweet".


### 1. Word Count Feature

In [22]:
# number of words/terms in the tweets
tweets['word_count'] = [len(i.split()) for i in tweets['tweet']]

In [23]:
tweets.head()

Unnamed: 0,id,label,tweet,word_count
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,13
1,2,0,Finally a transparant silicon case ^^ Thanks t...,17
2,3,0,We love this! Would you go? #talk #makememorie...,15
3,4,0,I'm wired I know I'm George I was made that wa...,17
4,5,1,What amazing service! Apple won't even talk to...,23


As you can see, we have a new feature __word_count__. Now let's create a feature of number of spaces in the tweets.

### 2. Space Count Feature

In [24]:
tweets['space_count'] = [i.count(' ') for i in tweets['tweet']]

In [25]:
tweets.head()

Unnamed: 0,id,label,tweet,word_count,space_count
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,13,12
1,2,0,Finally a transparant silicon case ^^ Thanks t...,17,16
2,3,0,We love this! Would you go? #talk #makememorie...,15,14
3,4,0,I'm wired I know I'm George I was made that wa...,17,16
4,5,1,What amazing service! Apple won't even talk to...,23,22


### 3. Character Count Feature

In [26]:
tweets['character_count'] = [len(i) - i.count(' ') for i in tweets['tweet']]

In [30]:
tweets.head()

Unnamed: 0,id,label,tweet,word_count,space_count,character_count,average_word_length
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,13,12,116,8.923077
1,2,0,Finally a transparant silicon case ^^ Thanks t...,17,16,115,6.764706
2,3,0,We love this! Would you go? #talk #makememorie...,15,14,109,7.266667
3,4,0,I'm wired I know I'm George I was made that wa...,17,16,96,5.647059
4,5,1,What amazing service! Apple won't even talk to...,23,22,102,4.434783


### 4. Average Word Length Feature

In [28]:
avg_word_length = []

# nested for loop
for i in tweets['tweet']:
    word_lengths = []
    for j in i.split():
        # length of terms in a tweet
        word_lengths.append(len(j))
    
    # average word length of a tweet
    l = sum(word_lengths)/len(word_lengths)
    
    avg_word_length.append(l)

In [29]:
# create new feature 
tweets['average_word_length'] = avg_word_length

# Build Model

In [31]:
tweets.head()

Unnamed: 0,id,label,tweet,word_count,space_count,character_count,average_word_length
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,13,12,116,8.923077
1,2,0,Finally a transparant silicon case ^^ Thanks t...,17,16,115,6.764706
2,3,0,We love this! Would you go? #talk #makememorie...,15,14,109,7.266667
3,4,0,I'm wired I know I'm George I was made that wa...,17,16,96,5.647059
4,5,1,What amazing service! Apple won't even talk to...,23,22,102,4.434783


In [32]:
X = tweets[['word_count', 'space_count', 'character_count', 'average_word_length']]
y = tweets['label']

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler # for standardization

In [34]:
# split dataset into train and test set
xtrain, xtest, ytrain, ytest = train_test_split(StandardScaler().fit_transform(X), y, 
                                                test_size=0.33, random_state=42)

In [35]:
xtrain.shape, xtest.shape

((5306, 4), (2614, 4))

In [36]:
# fit model
lr = LogisticRegression()
lr.fit(xtrain, ytrain)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [37]:
# predict on test set
preds = lr.predict_proba(xtest)

In [38]:
preds

array([[0.92237069, 0.07762931],
       [0.59906325, 0.40093675],
       [0.95121387, 0.04878613],
       ...,
       [0.22827648, 0.77172352],
       [0.57339926, 0.42660074],
       [0.85059379, 0.14940621]])

In [39]:
roc_auc_score(ytest, preds[:,1])

0.8634915906355785