<a href="https://colab.research.google.com/github/wjdolan/DS_Portfolio/blob/main/Text_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Feature Extraction for text Data**

## BoW

In [3]:
# list of documents
docs = ["SUPERB, I AM IN LOVE IN THIS PHONE", "I hate this phone"]

words = list(set([word for doc in docs for word in doc.lower().split()]))

vectors = []
for doc in docs:
    vectors.append([1 if word in doc.lower().split() else 0 for word in words])

print("vocabulary: ", words)   
print("vectors: ", vectors)


vocabulary:  ['i', 'phone', 'superb,', 'this', 'love', 'in', 'hate', 'am']
vectors:  [[1, 1, 1, 1, 1, 1, 0, 1], [1, 1, 0, 1, 0, 0, 1, 0]]


## CountVectorizer

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# create the transform
vectorizer = CountVectorizer()

# tokenize and build vocab
vectorizer.fit(docs)
print('vocabulary: ', vectorizer.vocabulary_)

# encode document
vector = vectorizer.transform(docs)

# summarize encoded vector
print('shape: ', vector.shape)
print('vectors: ', vector.toarray())

vocabulary:  {'superb': 5, 'am': 0, 'in': 2, 'love': 3, 'this': 6, 'phone': 4, 'hate': 1}
shape:  (2, 7)
vectors:  [[1 0 2 1 1 1 1]
 [0 1 0 0 1 0 1]]


## TF-IDF

https://miro.medium.com/max/392/1*q6WuO-BoZFIFR9o8cgajRQ.png

tf * (idf + 1) = tf + tf * idf”

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create the transform
vectorizer = TfidfVectorizer()

# tokenize and build vocab
vectorizer.fit(docs)

# summarize
print('vocabulary: ', vectorizer.vocabulary_)
print('idfs: ', vectorizer.idf_)

# encode document
vector = vectorizer.transform([docs[0]])

# summarize encoded vector
print('vectors: ', vector.toarray())

vocabulary:  {'superb': 5, 'am': 0, 'in': 2, 'love': 3, 'this': 6, 'phone': 4, 'hate': 1}
idfs:  [1.40546511 1.40546511 1.40546511 1.40546511 1.         1.40546511
 1.        ]
vectors:  [[0.35327777 0.         0.70655553 0.35327777 0.25136004 0.35327777
  0.25136004]]
