1. TextBlob Translation

In [4]:
from textblob import TextBlob

In [7]:
def translate(text, from_l, to_l):
    """
    Translate a piece of text from one language to another using TextBlob.

    :param text : the string to translate
    :param from_l : source language code, forwarded as ``from_lang``
    :param to_l : target language code, forwarded as ``to``
    :return: the result of ``TextBlob.translate`` for the given text
    """
    # NOTE(review): newer TextBlob releases appear to have dropped translate();
    # confirm the installed version still provides it.
    source_blob = TextBlob(text)
    translated = source_blob.translate(from_lang=from_l, to=to_l)
    return translated

translate(text='Yo estoy muy bien', from_l='es', to_l='en')

TextBlob("I'm fine")

2. Feature Extraction!

a. General and Specific Features

General features, such as word counts, do not depend on the context or inherent meaning of the text.
Specific features, on the other hand, do.

In [8]:
#Extract general features from a text
import pandas as pd

In [9]:
# Four sample sentences, one per row, held in a single 'text' column.
df = pd.DataFrame({
    'text': [
        'The interim budget for 2019 will be announced on 1st February.',
        'Do you know how much expectation the middle-class working population is having from this budget.',
        'February is the shortest month in a year',
        'This financial year will end on 31st March.',
    ]
})
df.head()

Unnamed: 0,text
0,The interim budget for 2019 will be announced ...
1,Do you know how much expectation the middle-cl...
2,February is the shortest month in a year
3,This financial year will end on 31st March.


In [10]:
# Tokenize each row's text with TextBlob and record how many words it contains.
def add_num_words(df):
    """
    Add a ``number_of_words`` column holding the TextBlob word count of every ``text`` entry.

    :param df : dataframe with a ``text`` column; it is modified in place
    :return: the same dataframe, now carrying ``number_of_words``
    """
    def count_words(entry):
        # str() lets non-string cells go through TextBlob as well.
        return len(TextBlob(str(entry)).words)

    df['number_of_words'] = df['text'].apply(count_words)
    return df

add_num_words(df)['number_of_words']

0    11
1    15
2     8
3     8
Name: number_of_words, dtype: int64

In [11]:
# The number_of_words column is visible here because add_num_words() assigned it on the frame it was given.
df

Unnamed: 0,text,number_of_words
0,The interim budget for 2019 will be announced ...,11
1,Do you know how much expectation the middle-cl...,15
2,February is the shortest month in a year,8
3,This financial year will end on 31st March.,8


In [15]:
def is_present(wh_words, df):
    """
    Flag every row of ``df`` whose ``text`` contains at least one of ``wh_words``.

    Matching is case-insensitive: both the tokens of the text and ``wh_words`` are
    lower-cased before they are compared, so a sentence-initial "What" or "How"
    is detected as well.

    :param wh_words : iterable of words to look for
    :param df : dataframe with a ``text`` column; it is modified in place
    :return: the same dataframe, with a boolean ``is_wh_words_present`` column added
    """
    # Normalise the targets once rather than once per row.
    targets = {word.lower() for word in wh_words}

    def has_wh_word(entry):
        tokens = {token.lower() for token in TextBlob(str(entry)).words}
        return not targets.isdisjoint(tokens)

    df['is_wh_words_present'] = df['text'].apply(has_wh_word)
    return df

wh_words = {'why', 'who', 'which', 'what', 'where', 'when', 'how'}

is_present(wh_words, df)['is_wh_words_present']

0    False
1     True
2    False
3    False
Name: is_wh_words_present, dtype: bool

In [22]:
# df now also holds the is_wh_words_present flag that is_present() assigned on it.
df

Unnamed: 0,text,number_of_words,is_wh_words_present
0,The interim budget for 2019 will be announced ...,11,False
1,Do you know how much expectation the middle-cl...,15,True
2,February is the shortest month in a year,8,False
3,This financial year will end on 31st March.,8,False


In [26]:
#Instead of manually coding this, let's use Bag of Words (BOW) to extract features
from sklearn.feature_extraction.text import CountVectorizer

def vectorize_text(corpus):
    """
    Build a bag-of-words table for a corpus: one row per document, one column
    per vocabulary term, each cell holding the count of that term in the document.

    :param corpus : input text corpus
    :return: dataframe of vectors
    """
    # fit_transform() does three things in a single call:
    #   1. tokenizes the collection of documents (the corpus)
    #   2. builds the vocabulary of unique words
    #   3. converts every document into a vector over that vocabulary
    vectorizer = CountVectorizer()
    term_counts = vectorizer.fit_transform(corpus)

    # The matrix columns follow the vocabulary in alphabetical order, so the
    # sorted vocabulary supplies the column labels.
    column_names = sorted(vectorizer.vocabulary_)

    return pd.DataFrame(term_counts.toarray(), columns=column_names)

In [27]:
# Four short documents used as the demo corpus for the bag-of-words model.
corpus = ['Data Science is an overlap between the Arts and Sciences',
          'Generally, Arts graduates are right-brained and Science graduates are left-brained',
          'Excelling in both Arts and Science at a time become difficult',
          'Natural Language Processing is part of Data Science']

# NOTE(review): this rebinds `df`, replacing the feature dataframe built in the earlier cells.
df = vectorize_text(corpus)
df.head()

Unnamed: 0,an,and,are,arts,at,become,between,both,brained,data,...,natural,of,overlap,part,processing,right,science,sciences,the,time
0,1,1,0,1,0,0,1,0,0,1,...,0,0,1,0,0,0,1,1,1,0
1,0,1,2,1,0,0,0,0,2,0,...,0,0,0,0,0,1,1,0,0,0
2,0,1,0,1,1,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
3,0,0,0,0,0,0,0,0,0,1,...,1,1,0,1,1,0,1,0,0,0


In [None]:
# Create a BOW model restricted to the top n most frequent terms
def bow_top_n(corpus, n):
    """
    Will return a dataframe in which every row is a document represented by the
    counts of the top n most frequently occurring words in a corpus

    :param corpus : input text corpus
    :param n : maximum number of vocabulary terms to keep; forwarded to
               CountVectorizer as ``max_features``
    :return: dataframe of vectors with at most n columns
    """
    # max_features=n keeps only the n terms with the highest total count across
    # the corpus; without it every term in the vocabulary would be returned.
    bag_of_words_model = CountVectorizer(max_features=n)

    # fit_transform() performs the following:
    #   1. tokenizes the collection of documents referred to as a corpus
    #   2. builds vocab of unique words (limited to the top n here)
    #   3. converts a document into vectors using the vocab built in step 2
    term_counts = bag_of_words_model.fit_transform(corpus)

    # vocabulary_ only contains the retained terms, so sorting it yields exactly
    # one label per remaining column.
    return pd.DataFrame(term_counts.toarray(),
                        columns=sorted(bag_of_words_model.vocabulary_))