## Feature Extraction

### Part 1: Bag-of-Words Approach

In [None]:
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on:
#   - 'stopwords' provides the English stop-word list loaded below;
#   - word_tokenize() additionally needs the Punkt tokenizer models, which are
#     published as 'punkt' or 'punkt_tab' depending on the NLTK version, so
#     both are requested.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stop words used to filter tokens later on.
nltk_stop_words = stopwords.words('english')

In [None]:
# "Good guy" example from the slides.
inputs = "I am a good guy. She is a good lady. I and she are good people. He is also a good guy."

# Lowercase each token, then keep only the alphabetic tokens that are not
# English stop words.
tokenized = [
    token
    for token in map(str.lower, word_tokenize(inputs))
    if token.isalpha() and token not in nltk_stop_words
]
print("token list:", tokenized)

In [None]:
# Tally how many times each remaining token occurs.
count = Counter()
count.update(tokenized)
print("token & frequency:", count)

In [None]:
# Show the two most frequent tokens together with their counts.
print("most common tokens:", count.most_common(n=2))

In [None]:
#Let's use pandas (pd) to show the results as tables.
import pandas as pd

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a CountVectorizer that relies entirely on its default settings.
coun_vect = CountVectorizer()

# Constructor arguments that are commonly tuned:
#   lowercase, stop_words, min_df, max_df, ngram_range, ...
#
# Complete parameter reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [None]:
# Toy corpus: three short documents stored as a pandas Series.
input_v2 = pd.Series(
    [
        "I am a good guy. She is a good lady.",
        "I and she are good people.",
        "He is also a good guy.",
    ]
)

# Display the Series followed by its shape.
print(input_v2, input_v2.shape, sep="\n")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Configure the vectorizer to lowercase the text and drop English stop words.
coun_vect = CountVectorizer(stop_words='english', lowercase=True)

# Learn the vocabulary from the corpus and encode each document as term counts.
count_matrix = coun_vect.fit_transform(input_v2)
print(count_matrix)


#CountVectorizer automatically drops single-character tokens and lowercases the text.

In [None]:
# Vocabulary learned by the vectorizer, followed by its size.
vocabulary = coun_vect.vocabulary_
print("Unique Vocabulary: ", vocabulary)
print(len(vocabulary))

# Expand the sparse count matrix into a dense 2D array.
count_array = count_matrix.toarray()

# Display the array followed by its shape.
print(count_array, count_array.shape, sep="\n")

In [None]:
# Convert the 2D count array into a pandas DataFrame and label each column
# with its vocabulary word.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
bow_df = pd.DataFrame(count_array, columns=coun_vect.get_feature_names_out())

print(bow_df)

### Part 2: TF-IDF Approach

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TfidfVectorizer that relies entirely on its default settings.
tfidf = TfidfVectorizer()

# Reference for the tunable parameters:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [None]:
# Three short review sentences used as the TF-IDF example corpus.
example_list = [
    "The apartment was clean and room was big.",
    "The place was very clean and in a great neighborhood!",
    "we stayed at a clean place ",
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer that lowercases the text, removes English stop
# words, and applies inverse-document-frequency weighting.
tf_vec = TfidfVectorizer(use_idf=True, stop_words='english', lowercase=True)

# Fit on the example sentences and produce the TF-IDF matrix.
tfidf_matrix = tf_vec.fit_transform(example_list)

# Vocabulary learned during fitting, followed by its size.
tf_vocabulary = tf_vec.vocabulary_
print("Unique Vocabulary: ", tf_vocabulary)
print(len(tf_vocabulary))

# Expand the sparse TF-IDF matrix into a dense 2D array.
tf_array = tfidf_matrix.toarray()

# Display the array followed by its shape.
print(tf_array, tf_array.shape, sep="\n")

In [None]:
# Convert the 2D TF-IDF array into a pandas DataFrame and label each column
# with its vocabulary word.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
tf_df = pd.DataFrame(tf_array, columns=tf_vec.get_feature_names_out())

print(tf_df)

In [None]:
#How to interpret the outputs:

#For example, the tf-idf value of the word at column index 1 in document 0 is 0.546.

#"clean" and "place" appear in more documents, so they should be weighted lower than words that appear in fewer documents.

## Part 3: Cosine Similarity

In [None]:
import numpy as np
from numpy.linalg import norm
 
# Two example vectors as NumPy arrays.
A = np.array([1, 3, 6])
B = np.array([-2, -4, 6])

print("A:", A)
print("B:", B)

# Cosine similarity: dot product divided by the product of the vector norms.
dot_ab = A @ B
norm_product = norm(A) * norm(B)
cos = dot_ab / norm_product
print("Cosine Similarity:", cos)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# The same two vectors, this time as plain Python lists.
A = [1, 3, 6]
B = [-2, -4, 6]

# cosine_similarity() takes 2D inputs, so each vector is wrapped in a list to
# form a single-row matrix.
cos = cosine_similarity([A], [B])
print(cos)

In [None]:
# A holds three row vectors; B is the single vector each row is compared with.
A = np.array([[2, 1, 2], [3, 2, 9], [-1, 2, -3]])
B = np.array([3, 4, 2])
print("A:\n", A)
print("B:\n", B)

# Row-wise cosine similarity: one dot product per row of A, divided by that
# row's norm times the norm of B.
row_norms = norm(A, axis=1)
cos = (A @ B) / (row_norms * norm(B))
print("Cosine Similarity:\n", cos)

In [None]:
# Three row vectors and the single vector they are compared with.
A = [[2, 1, 2], [3, 2, 9], [-1, 2, -3]]
B = [3, 4, 2]

print(A[0])
print(B)

# Compare each row of A against B, one pair at a time.
for row in A:
    cos = cosine_similarity([row], [B])
    print(cos)
    

In [None]:
# Four short documents used for the document-similarity example.
input_cos = [
    "I am a good guy. She is a good lady.",
    "I and she are good people.",
    "He is also a good guy.",
    "I and she like a movie.",
]

In [None]:
# Build a CountVectorizer with default settings.
coun_vect = CountVectorizer()

# Learn the vocabulary from the four documents and encode them as term counts.
count_matrix = coun_vect.fit_transform(input_cos)

# Vocabulary learned by the vectorizer, followed by its size.
vocabulary = coun_vect.vocabulary_
print("Unique Vocabulary: ", vocabulary)
print(len(vocabulary))

# Expand the sparse count matrix into a dense 2D array.
count_array = count_matrix.toarray()

# Display the array followed by its shape.
print(count_array, count_array.shape, sep="\n")

In [None]:
def cos(A, B):
    """Return the cosine similarity between vectors A and B.

    The similarity is the dot product of A and B divided by the product of
    their norms.

    Args:
        A: First vector (array-like of numbers).
        B: Second vector, with the same length as A.

    Returns:
        The cosine similarity, or 0.0 when either vector has zero norm.
    """
    denominator = norm(A) * norm(B)
    # A zero-norm vector would make this 0/0, which yields nan and a
    # RuntimeWarning; report "no similarity" instead.
    if denominator == 0:
        return 0.0
    return np.dot(A, B) / denominator

# Compare document 0 with documents 2 and 3 using the hand-written cos(),
# with a blank line between the two results.
reference_doc = count_array[0]
print(cos(reference_doc, count_array[2]))
print()
print(cos(reference_doc, count_array[3]))

In [None]:
# Same two comparisons using scikit-learn. Each single-row slice keeps the
# operand 2D, as cosine_similarity() requires.
print(cosine_similarity(count_array[:1], count_array[2:3]))
print(cosine_similarity(count_array[:1], count_array[3:4]))

In [None]:
# Pairwise similarity between every pair of documents; when only one matrix is
# given, cosine_similarity() compares it against itself.
print(cosine_similarity(count_array))