# Text Data in scikit-learn

<a href="https://colab.research.google.com/github/thomasjpfan/ml-workshop-advanced/blob/master/notebooks/01-text-data.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab" title="Open and Execute in Google Colaboratory"></a>

In [None]:
# Install dependencies for Google Colab
import sys
# `google.colab` already being imported is used as the signal that this notebook runs on Colab.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    # IPython magic: install the packages listed in the workshop's requirements.txt.
    %pip install -r https://raw.githubusercontent.com/thomasjpfan/ml-workshop-advanced/master/requirements.txt

In [None]:
import sklearn
# The notebook requires a scikit-learn 1.0.x release; fail early with a clear message otherwise.
assert sklearn.__version__.startswith("1.0"), "Please install scikit-learn 1.0"

In [None]:
import matplotlib.pyplot as plt

sklearn.set_config(display='diagram')
plt.style.use('ggplot')

In [None]:
from pathlib import Path
import urllib.request
import tarfile

# Locations of the dataset directory, the downloaded archive, and the
# directory the archive unpacks into (all relative to the working directory).
data_path = Path("data")
review_polarity_path = data_path.joinpath("review_polarity.tar.gz")
extracted_path = data_path.joinpath("txt_sentoken")

def untar_review():
    """Make sure the review polarity dataset is extracted under ``data_path``.

    On Google Colab (``IN_COLAB``) the archive is first downloaded from the
    workshop repository when ``review_polarity_path`` does not exist yet.
    If ``extracted_path`` already exists nothing else is done; otherwise the
    ``txt_sentoken`` members of the archive are extracted into ``data_path``.
    """
    # exist_ok avoids the check-then-create race and makes re-running the cell safe.
    data_path.mkdir(parents=True, exist_ok=True)

    if IN_COLAB and not review_polarity_path.exists():
        # Download data from github for google colab
        url = "https://github.com/thomasjpfan/ml-workshop-advanced/raw/master/notebooks/data/review_polarity.tar.gz"
        with urllib.request.urlopen(url) as f:
            review_polarity_path.write_bytes(f.read())

    if extracted_path.exists():
        print("review_polarity dataset already extracted")
        return

    with tarfile.open(review_polarity_path, "r") as tar_f:
        text_members = [m for m in tar_f.getmembers()
                        if m.name.startswith("txt_sentoken")]
        # Use the "data" extraction filter when this Python version offers it:
        # it rejects members that would be written outside ``data_path`` and
        # avoids the DeprecationWarning raised by filter-less extraction.
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        tar_f.extractall(path=data_path, members=text_members, **extract_kwargs)
            
# This may take some time to run since it will download and extract the dataset
untar_review()

## CountVectorizer

In [None]:
sample_text = ["Can we go to the hill? I finished my homework.",
               "The hill is very tall. Please be careful"]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vect = CountVectorizer()
vect.fit(sample_text)

vect.get_feature_names_out()

In [None]:
X = vect.transform(sample_text)
X

In [None]:
X.toarray()

### Bag of words

In [None]:
sample_text

In [None]:
X_inverse = vect.inverse_transform(X)

In [None]:
X_inverse[0]

In [None]:
X_inverse[1]

## Loading text data with scikit-learn

In [None]:
from sklearn.datasets import load_files

reviews_train = load_files(extracted_path, categories=["neg", "pos"])
raw_text_train, raw_y_train = reviews_train.data, reviews_train.target

In [None]:
import numpy as np
np.unique(raw_y_train)

In [None]:
np.bincount(raw_y_train)

In [None]:
len(raw_text_train)

In [None]:
raw_text_train[2]

## Split dataset

In [None]:
from sklearn.model_selection import train_test_split

text_train, text_test, y_train, y_test = train_test_split(
    raw_text_train, raw_y_train, stratify=raw_y_train, random_state=0)

### Transform training data

In [None]:
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)

In [None]:
len(text_train)

In [None]:
X_train

### Transform testing set

In [None]:
len(text_test)

In [None]:
X_test = vect.transform(text_test)

In [None]:
X_test

### Extract feature names

In [None]:
feature_names = vect.get_feature_names_out()

In [None]:
feature_names[10000:10020]

In [None]:
feature_names[::3000]

### Linear model for classification

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)

In [None]:
lr.score(X_test, y_test)

In [None]:
def plot_important_features(coef, feature_names, top_n=20, rotation=40):
    """Bar-plot the most negative and most positive coefficients.

    Parameters
    ----------
    coef : array-like
        Coefficients of a fitted model; flattened to 1-D before ranking.
    feature_names : array-like of str
        Feature names, indexed in the same order as the flattened ``coef``.
    top_n : int, default=20
        Number of features to show from each end of the ranking. When fewer
        than ``top_n`` coefficients exist, all of them are shown at each end.
    rotation : float, default=40
        Rotation of the x tick labels, passed to ``set_xticklabels``.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    fig, ax = plt.subplots(figsize=(18, 6))
    coef = np.asarray(coef).reshape(-1)
    # Fancy indexing below needs an array, so plain lists of names are accepted too.
    feature_names = np.asarray(feature_names)
    inds = np.argsort(coef)
    # Slice from explicit positions: ``inds[-top_n:]`` would return every
    # feature for top_n == 0.
    n_each = min(top_n, coef.size)
    low = inds[:n_each]
    high = inds[coef.size - n_each:]
    important = np.hstack([low, high])
    my_range = range(len(important))
    # Size the colours from what was actually selected so red/blue always
    # line up with the lowest/highest groups.
    colors = ['red'] * len(low) + ['blue'] * len(high)

    ax.bar(my_range, coef[important], color=colors)
    ax.set_xticks(my_range)
    ax.set_xticklabels(feature_names[important], rotation=rotation, ha="right", fontsize=16)
    ax.set_xlim(-.7, len(important))
    ax.set_frame_on(False)

In [None]:
feature_names = vect.get_feature_names_out()

In [None]:
plot_important_features(lr.coef_, feature_names, top_n=15)

## Exercise 1

1. Train a `RandomForestClassifier` with `max_depth=3` on the training set, `X_train` and `y_train`.
2. Evaluate the accuracy on the test set.
3. What are the top 20 important features according to `feature_importances_` of the random forest?
    - **Hint**: Use `argsort` and array slicing.

In [None]:
from sklearn.ensemble import RandomForestClassifier

**If you are running locally**, you can uncomment the following cell to load the solution into the cell. On **Google Colab**, [see solution here](https://github.com/thomasjpfan/ml-workshop-advanced/blob/master/notebooks/solutions/01-ex01-solutions.py).

In [None]:
# %load solutions/01-ex01-solutions.py

## CountVectorizer Options

In [None]:
sample_text = ["Can we go to the hill? I finished my homework.",
               "The hill is very tall. Please be careful"]

In [None]:
vect = CountVectorizer()
vect.fit(sample_text)
vect.get_feature_names_out()

### Stop words

In [None]:
vect = CountVectorizer(stop_words='english')
vect.fit(sample_text)
vect.get_feature_names_out()

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(list(ENGLISH_STOP_WORDS))

### Max features

In [None]:
vect = CountVectorizer(max_features=4, stop_words='english')
vect.fit(sample_text)
vect.get_feature_names_out()

### Min frequency on the review dataset

With `min_df=1` (default)

In [None]:
X_train.shape

With `min_df=2`

In [None]:
vect = CountVectorizer(min_df=2)
X_train_min_df_2 = vect.fit_transform(text_train)

In [None]:
X_train_min_df_2.shape

In [None]:
# Seeded like `lr` so the comparison of the two test scores is reproducible.
lr_df_2 = LogisticRegression(solver='liblinear', random_state=42).fit(X_train_min_df_2, y_train)

In [None]:
X_test_min_df_2 = vect.transform(text_test)

#### Scores with different min frequencies

In [None]:
lr_df_2.score(X_test_min_df_2, y_test)

In [None]:
lr.score(X_test, y_test)

## Pipelines and Vectorizers

In [None]:
from sklearn.pipeline import Pipeline

log_reg = Pipeline([
    ('vectorizer', CountVectorizer(min_df=2)),
    ('classifier', LogisticRegression(random_state=42, solver='liblinear'))
])

In [None]:
log_reg

In [None]:
text_train[2]

In [None]:
log_reg.fit(text_train, y_train)

In [None]:
log_reg.score(text_test, y_test)

## Exercise 2

1. Create a pipeline with a `CountVectorizer` with `min_df=2` and `stop_words='english'` and a `RandomForestClassifier` with `max_depth=3`.
2. What is the score of the random forest on the test dataset?

**If you are running locally**, you can uncomment the following cell to load the solution into the cell. On **Google Colab**, [see solution here](https://github.com/thomasjpfan/ml-workshop-advanced/blob/master/notebooks/solutions/01-ex02-solutions.py).

In [None]:
# %load solutions/01-ex02-solutions.py

## Bigrams

`CountVectorizer` takes an `ngram_range` parameter.

In [None]:
sample_text

In [None]:
cv = CountVectorizer(ngram_range=(1, 1)).fit(sample_text)
print("Vocabulary size:", len(cv.vocabulary_))
print("Vocabulary:", cv.get_feature_names_out())

In [None]:
cv = CountVectorizer(ngram_range=(2, 2)).fit(sample_text)
print("Vocabulary size:", len(cv.vocabulary_))
print("Vocabulary:")
print(cv.get_feature_names_out())

In [None]:
cv = CountVectorizer(ngram_range=(1, 2)).fit(sample_text)
print("Vocabulary size:", len(cv.vocabulary_))
print("Vocabulary:")
print(cv.get_feature_names_out())

## n-grams with stop words

In [None]:
cv_n_gram = CountVectorizer(ngram_range=(1, 2), min_df=2, stop_words="english")
cv_n_gram.fit(text_train)

In [None]:
len(cv_n_gram.vocabulary_)

In [None]:
print(cv_n_gram.get_feature_names_out()[::2000])

In [None]:
# Vectorizer (unigrams + bigrams, English stop words removed) followed by a
# logistic regression; the classifier is seeded like the `log_reg` pipeline
# so results are reproducible across runs.
pipe_cv_n_gram = Pipeline([
    ('vectorizer', cv_n_gram),
    ('classifier', LogisticRegression(random_state=42, solver='liblinear'))
])

In [None]:
pipe_cv_n_gram.fit(text_train, y_train)

In [None]:
pipe_cv_n_gram.score(text_test, y_test)

In [None]:
feature_names = pipe_cv_n_gram[:-1].get_feature_names_out()

In [None]:
classifier_coefs = pipe_cv_n_gram['classifier'].coef_.ravel()

In [None]:
plot_important_features(classifier_coefs, feature_names)

### Only look at 2-grams 

In [None]:
is_two_gram = np.asarray([len(word.split(" ")) == 2 for word in feature_names], dtype=bool)

In [None]:
plot_important_features(classifier_coefs[is_two_gram], feature_names[is_two_gram])

## Tf-idf rescaling

In [None]:
sample_text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidvect = TfidfVectorizer().fit(sample_text)
tfid_trans = tfidvect.transform(sample_text)

In [None]:
tfid_trans.toarray()

## Train on the review dataset

In [None]:
rf_tfid = Pipeline([
   ('vectorizer', TfidfVectorizer(ngram_range=(1, 2), min_df=4,
                                  stop_words="english")),
   ('classifier', RandomForestClassifier(random_state=42))
])

In [None]:
rf_tfid.fit(text_train, y_train)

In [None]:
rf_tfid.score(text_test, y_test)

## Exercise 3

1. How many samples are there in the training dataset and test dataset?
1. Construct a pipeline with a `TfidfVectorizer` and `LogisticRegression`.
1. Evaluate the pipeline on the test set.
1. Plot the feature importances using `plot_important_features`.

In [None]:
from sklearn.datasets import fetch_20newsgroups

categories = [
    'comp.sys.mac.hardware',
    'comp.os.ms-windows.misc',
]
remove = ('headers', 'footers', 'quotes')

data_train = fetch_20newsgroups(subset='train', categories=categories,
                                remove=remove)

data_test = fetch_20newsgroups(subset='test', categories=categories,
                               remove=remove)

text_train, y_train = data_train.data, data_train.target
text_test, y_test = data_test.data, data_test.target

**If you are running locally**, you can uncomment the following cell to load the solution into the cell. On **Google Colab**, [see solution here](https://github.com/thomasjpfan/ml-workshop-advanced/blob/master/notebooks/solutions/01-ex03-solutions.py).

In [None]:
# %load solutions/01-ex03-solutions.py