# Feature Engineering: Exercises

In [None]:
! git clone https://github.com/zoonek2/2024_Summer_Schools
! ln -s 2024_Summer_Schools/data .
! ln -s 2024_Summer_Schools/images .
! pip install palmerpenguins pydataset sentence_transformers timm umap-learn sentence_transformers
import nltk
nltk.download('punkt')

In [14]:
from collections import Counter
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import datetime
import sklearn.datasets
from pydataset import data
from PIL import Image
from palmerpenguins import load_penguins

# Financial data

- Check that EPS and Price are almost proportional
- Compute the P/E ratio
- Plot the distribution of MCap and log(MCap)

In [None]:
# Load the financial dataset used in this section
# (presumably it contains the EPS, Price and MCap columns named above -- check d.columns)
d = pd.read_csv( "data/feature_analysis__financial_data.csv" )

In [None]:
## INSERT YOUR CODE HERE

# Images
Compute the "average colour" of the following images.
Can we use it to classify the images?

In [None]:
def _load_rgb_pixels( path ):
    """Return the pixels of the image stored at `path` as an (n_pixels, 3) integer array of R, G, B values."""
    # The context manager closes the file as soon as the pixels have been read.
    with Image.open( path ) as image:
        # convert("RGB") guarantees three channels whatever the mode stored in the file;
        # reshape(-1, 3) lists the pixels row by row, one (R, G, B) triple per line.
        return np.asarray( image.convert( "RGB" ) ).reshape( -1, 3 ).astype( int )

images = [ _load_rgb_pixels( f"data/tmp{j+1}.jpg" ) for j in range(3) ]

In [None]:
averages = [
    ## INSERT YOUR CODE HERE
    for x in images
]

In [None]:
# Suggested visualisation: one large dot per image, filled with its average colour
fig, ax = plt.subplots( figsize = (4,1) )
ax.scatter(
    x = [1,2,3],
    y = [0,0,0],
    s = 2000,
    # matplotlib expects RGB components in [0,1], hence the division by 255
    color = [ rgb/255 for rgb in averages ],
)
ax.set_xlim( left = 0, right = 4 )
ax.axis( 'off' )
plt.show()

# Text
Compute some features on the following texts.
For instance (the list is not exhaustive: add your own!)
- average length of the words
- average length of the sentences
- presence of numbers
- presence of all-caps words
- presence of past tense
- presence of future tense
- etc.

In [None]:
kantlipsum = """As any dedicated reader can clearly see, the Ideal of practical reason is a representation
of, as far as I know, the things in themselves; as I have shown elsewhere, the phenomena
should only be used as a canon for our understanding. The paralogisms of practical
reason are what first give rise to the architectonic of practical reason. As will easily be
shown in the next section, reason would thereby be made to contradict, in view of these
considerations, the Ideal of practical reason, yet the manifold depends on the phenomena.
Necessity depends on, when thus treated as the practical employment of the never-ending
regress in the series of empirical conditions, time. Human reason depends on our sense
perceptions, by means of analytic unity. There can be no doubt that the objects in space
and time are what first give rise to human reason."""

trump_tweet = """Time Magazine called to say that I was PROBABLY going to be named “Man (Person) of the Year,” like last year, but I would have to agree to an interview and a major photo shoot. I said probably is no good and took a pass. Thanks anyway!"""
trump2 = """So interesting to see “Progressive” Democrat Congresswomen, who originally came from countries whose governments are a complete and total catastrophe, the worst, most corrupt and inept anywhere in the world (if they even have a functioning government at all), now loudly......"""

financial_news = """Unprecedented demand for Nvidia's chips and data center services has fueled a new wave of growth for the company. With shares up over 220% in the last year, many investors probably think they've missed the boat."""

seuss = """
At the far end of town where the Grickle-grass grows and the wind smells slow-and-sour when it blows and no
birds ever sing excepting old crows... is the Street of the Lifted Lorax.
And deep in the Grickle-grass, some people say, if you look deep enough you can still see, today, where the
Lorax once stood just as long as it could before somebody lifted the Lorax away.
What was the Lorax? Any why was it there? And why was it lifted and taken somewhere from the far end of
town where the Grickle-grass grows? The old Once-ler still lives here.
Ask him, he knows.
"""

# All the sample texts defined above, gathered in a single list.
# Note: the name `texts` is rebound to a dict of novels in the term-document section further down.
texts = [ kantlipsum, trump_tweet, trump2, financial_news, seuss ]

In [None]:
## INSERT YOUR CODE HERE

# Quantitative variables
In the following dataset, 
- Which variables would you transform with a min-max scaler?
- Which variables would you transform with a logarithm?
- Are there pairs of variables you would add, subtract or divide?

In [None]:
# Fetch the dataset from OpenML (as pandas objects) and display its description
housing = sklearn.datasets.fetch_openml( as_frame = True, name = "house_prices" )
print( housing.DESCR )

In [None]:
# Predictors and target side by side in one (new) data frame...
d = housing['data'].assign( **{ housing['target_names'][0]: housing['target'] } )
# ...restricted to a handful of columns
# (to keep every numeric column instead: d.select_dtypes( include = "number" ))
d = d[[
    'LotArea',
    'TotRmsAbvGrd',
    '1stFlrSF',
    '2ndFlrSF',
    'PoolArea',
    'SalePrice',
]]
d.head( n = 20 )

In [None]:
## INSERT YOUR CODE HERE

# Missing values
In the following dataset: 
- Display the rows with missing values
- Replace the missing values with the average of the column (or the most common value, for qualitative variables)
- Can you think of another (simple, but more precise) way of imputing the missing dimensions?
- (*) Replace the missing values with a KNN forecast from the other columns
- (*) Compare the two approaches

In [None]:
# Palmer penguins dataset, as returned by palmerpenguins.load_penguins()
d = load_penguins()
# Preview of the first rows
d.head()

In [None]:
## INSERT YOUR CODE HERE

# Data augmentation

- Augment the following dataset with the square and the cube of `x`
- (*) Forecast `y`, with a linear model, using just `x`
- (*) Forecast `y`, with a linear model, using `x`, `x²`, `x³`
- (*) Plot the data and the forecasts

In [None]:
# Noisy sine wave: x is uniform on [-pi, pi), y = sin(x) + Gaussian noise (standard deviation 0.2).
# A seeded generator makes the sample identical on every run, so that results can be compared.
n = 500
rng = np.random.default_rng( 0 )
x = rng.uniform( -1, 1, size = n ) * math.pi
y = np.sin( x ) + rng.normal( size = n ) * .2
# Only the predictor goes into the data frame; the target stays in `y`
d = pd.DataFrame( { 'x': x } )
d.head()

In [None]:
## INSERT YOUR CODE HERE

# Qualitative variables
What would you do with the qualitative variables?
- For which ones is ordinal-encoding meaningful?

In [None]:
# Fetch the dataset from OpenML, as pandas objects
housing = sklearn.datasets.fetch_openml( as_frame = True, name = "house_prices" )
# print( housing.DESCR )   # uncomment to display the description of the variables

In [None]:
# Predictors and target side by side in one (new) data frame...
d = housing['data'].assign( **{ housing['target_names'][0]: housing['target'] } )
# ...restricted to the columns discussed in this exercise
d = d[[
    'MSSubClass',
    'MSZoning',
    'LotShape',
    'LotConfig',
    'Neighborhood',
    'OverallQual',
    'GarageQual',
]]
d.head( n = 20 )

In [None]:
## INSERT YOUR CODE HERE

# Dimension Reduction
The following dataset has 784 features.
- Reduce its dimension to 2 using PCA
- Reduce its dimension to 2 using UMAP
- Plot the results
- (*) Forecast `y` from the 2 features returned by UMAP
- (*) Forecast `y` from the 2 features returned by PCA
- (*) Compare those models
- (*) Forecast `y` from the first `m` principal components, for `m` ranging from 1 to 10, with a k-nearest neighbour model
- (*) How does the performance of the model change with `m`?

In [None]:
from sklearn.datasets import fetch_openml
# return_X_y=True: get the (data, target) pair; as_frame=False: as NumPy arrays rather than pandas objects
# NOTE(review): the labels in `y` are presumably strings ('0'...'9'), not integers -- check y.dtype before modelling
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
X

In [None]:
## INSERT YOUR CODE HERE

# Time series
We want to forecast future values of the following time series. 
- Would you make any transformation of the numeric data?
- Add a few lagged values as additional features
- Add a few rolling means as additional features
- Add the month and the day of the year as additional features

In [None]:
x = data('AirPassengers')
# The `time` column is treated as a fractional year (year + (month-1)/12): count whole months.
# Rounding, rather than truncating, gives the right month even when the stored
# fraction is slightly below or slightly above the exact multiple of 1/12.
months = np.round( 12 * x['time'] ).astype(int)
year  = months // 12
month = months % 12 + 1
# First day of each month...
date = pd.to_datetime( [ f"{u}-{v:02d}-01" for u,v in zip( year, month ) ] )
# ...moved to the last day of that same month
date = date + pd.offsets.MonthEnd(0)
d = pd.DataFrame( {
    'date': date,
    'AirPassengers': x.iloc[:,1].values.astype(float),
} )
d.tail()

In [None]:
## INSERT YOUR CODE HERE

# Text: term-document matrix
- What are the most common words in those texts? Try to remove words common in all of them.
- Compute the TF-IDF term-document matrix
- Reduce its dimension; plot the result
- (*) Same question with the *paragraphs* in those texts

In [None]:
!wget -nc https://www.gutenberg.org/cache/epub/64317/pg64317.txt
!wget -nc https://www.gutenberg.org/cache/epub/11/pg11.txt
!wget -nc https://www.gutenberg.org/cache/epub/1513/pg1513.txt
!wget -nc https://www.gutenberg.org/cache/epub/98/pg98.txt
!wget -nc https://www.gutenberg.org/cache/epub/67098/pg67098.txt

titles = {
    64317: "The Great Gatsby",
    11:    "Alice's Adventures in Wonderland",
    1513:  "Romeo and Juliet",
    98:    "A tale of two cities",
    67098: "Winnie the Pooh",
}

texts = {}
for id, title in titles.items():
    f = open( f"pg{id}.txt", "r" )
    texts[title] = f.read()
    f.close()

def remove_legalese(text):
    """
    Strip the licence header and footer from a Project Gutenberg book.

    Only the lines strictly between the first line starting with '*** START'
    and the first line starting with '*** END' are kept.
    Raises IndexError if either marker line is missing.
    """
    lines = text.split("\n")
    starts = [ k for k, line in enumerate(lines) if line.startswith( '*** START' ) ]
    ends   = [ k for k, line in enumerate(lines) if line.startswith( '*** END' ) ]
    return '\n'.join( lines[ starts[0] + 1 : ends[0] ] )
    
# For each book, keep only the text between its '*** START' and '*** END' marker lines
texts = { title: remove_legalese(text) for title, text in texts.items() }

In [None]:
## INSERT YOUR CODE HERE

# Constant features
In the following dataset:
- Are there constant features?
- Are there nearly constant features?
- Are there qualitative features with no repeated values?

In [None]:
# Fetch the dataset from OpenML, as pandas objects
housing = sklearn.datasets.fetch_openml( as_frame = True, name = "house_prices" )
# print( housing.DESCR )   # uncomment to display the description of the variables
# Predictors and target side by side in one (new) data frame...
d = housing['data'].assign( **{ housing['target_names'][0]: housing['target'] } )
# ...restricted to the columns discussed in this exercise
d = d[[
    'Id',
    'LotShape',
    'Utilities',
    'OverallQual',
    'CentralAir',
    'PoolQC',
]]
d.head( n = 20 )

In [None]:
## INSERT YOUR CODE HERE

# Redundant features
In the following dataset: 
- What are the 2 features with the largest correlation?

In [None]:
housing = sklearn.datasets.fetch_openml(name="house_prices", as_frame=True)
#print( housing['DESCR'] )
d = housing['data'].copy()
d[ housing['target_names'][0] ] = housing['target']
# Keep the numeric columns only. `select_dtypes` recognises every integer and
# floating-point dtype, whereas comparing a dtype with the Python types `int` and
# `float` only matches the platform's default widths.
d = d.select_dtypes( include = "number" )

In [None]:
# Keep the numeric columns only (all columns are kept when `d` has already been filtered).
# `select_dtypes` recognises every integer and floating-point dtype, whereas comparing
# a dtype with the Python types `int` and `float` only matches the platform's default widths.
d = d.select_dtypes( include = "number" )

In [None]:
## INSERT YOUR CODE HERE

# Useful features
- What are the 12 predictors most correlated with the target variable “SalePrice”?
- Plot a scatterplot of SalePrice against each of those variables
- (*) Fit linear models with increasingly more of those variables


In [None]:
housing = sklearn.datasets.fetch_openml(name="house_prices", as_frame=True)
#print( housing['DESCR'] )
d = housing['data'].copy()
d[ housing['target_names'][0] ] = housing['target']
# Keep the numeric columns only. `select_dtypes` recognises every integer and
# floating-point dtype, whereas comparing a dtype with the Python types `int` and
# `float` only matches the platform's default widths.
d = d.select_dtypes( include = "number" )
d

In [None]:
## INSERT YOUR CODE HERE

# Image embeddings (*)

The following code computes a 4096-dimensional embedding of the *.jpg files in the `images` directory.
- Reduce the dimension to 2 and plot the data. You can use a scatter plot, or use the images themselves (with `plt.imshow`, which has an `extent` argument).
- Can you think of a simple model to distinguish cat pictures from dog pictures (not one of the models you saw yesterday, something even simpler)?

In [None]:
import os
from urllib.request import urlopen
from PIL import Image
import timm
import torch
from tqdm.auto import tqdm

# Model to compute image embeddings
# (num_classes=0 drops the classification layer, so the model returns the features that would feed it)
model = timm.create_model( 'vgg19.tv_in1k', pretrained=True, num_classes=0 )
model = model.eval()
# Preprocessing (resize, crop, normalisation) matching what the model was trained with
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(**data_config, is_training=False)

# List of images (feel free to add more, or use your own)
duplicates = set( [ 'tmp3.jpg', 'tmp2.jpg', 'tmp1.jpg', 'i07.jpg', 'text1.jpg' ] )
files = os.listdir("images")
files = [ f for f in files if f not in duplicates ]
files = [ f for f in files if f.endswith(".jpg") ]
files = [ f"images/{f}" for f in files ]
np.random.shuffle( files )
embeddings = {}

def read_image(file):
    """
    Load an image as a centred square RGB crop.

    Copilot (DALL-E) does not allow me to choose the aspect ratio.
    It used to generate only square images, but it now only produces 7×4 images.
    Crop the images, if needed.

    Raises AssertionError if the image is in portrait format (width < height).
    """
    with Image.open(file) as raw:
        # Force three channels, so that greyscale, CMYK or RGBA files
        # are also accepted by the preprocessing and the model
        img = raw.convert("RGB")
    w, h = img.size
    assert w >= h, f"I expect the images to have a square or landscape aspect ratio: {file} has a portrait aspect ratio, {w}×{h}"
    # Remove the same number of columns on the left and on the right
    a = ( w - h ) // 2
    img = img.crop( (a, 0, a+h, h) )
    return img

# Compute the embeddings
# (inference only: no_grad avoids recording the operations for back-propagation)
with torch.no_grad():
    for file in tqdm(files):
        img = read_image(file)
        # unsqueeze(0) adds the batch dimension the model expects
        output = model(transforms(img).unsqueeze(0))
        embeddings[ file ] = output.numpy().flatten()
# One row per image, one column per embedding dimension
embeddings = pd.DataFrame( embeddings ).T

In [None]:
## INSERT YOUR CODE HERE

# Word embeddings (*)

Here are vector embeddings of 400,000 words. 
- Reduce their dimension to 2 and plot the resulting cloud of points
- Use the embeddings to find 100 names of animals (hint: pick an animal, and take the 100 closest words)
- Perform the dimension reduction on those 100 words alone, and plot the corresponding cloud of points.

In [3]:
import gensim.downloader
# Pretrained word vectors, fetched by name with gensim's downloader.
# Alternative model, kept for reference (judging by its name: 25-dimensional vectors trained on Twitter):
#vectors = gensim.downloader.load('glove-twitter-25')
vectors = gensim.downloader.load('glove-wiki-gigaword-300')  # SLOW

In [None]:
## INSERT YOUR CODE HERE

# Sentence embeddings (*) **MAY NOT WORK ON GOOGLE COLAB**
Here are embeddings of sentences from a few novels.
- Reduce the dimension of the data to 2, and plot the corresponding cloud of points
- Are the sentences in those novels homogeneous, or do you see clusters for some of them? Can you interpret those clusters?

In [19]:
import nltk
from sentence_transformers import SentenceTransformer, util
# Pretrained sentence-embedding model, loaded by name
model = SentenceTransformer( "all-mpnet-base-v2" )

# Title -> Project Gutenberg ebook number
ids = dict( [
    ( 'The Great Gatsby',                 64317 ),
    ( "Alice's Adventures in Wonderland",    11 ),
    ( 'Romeo and Juliet',                  1513 ),
    ( 'A tale of two cities',                98 ),
    ( 'Winnie the Pooh',                  67098 ),
] )

def remove_legalese(text):
    """
    Strip the licence header and footer from a Project Gutenberg book.

    Only the lines strictly between the first line starting with '*** START'
    and the first line starting with '*** END' are kept.
    Raises IndexError if either marker line is missing.
    """
    lines = text.split("\n")
    starts = [ k for k, line in enumerate(lines) if line.startswith( '*** START' ) ]
    ends   = [ k for k, line in enumerate(lines) if line.startswith( '*** END' ) ]
    return '\n'.join( lines[ starts[0] + 1 : ends[0] ] )

# Read the books from the `data` directory, keyed by title
raw_texts = {}
for title, book_id in ids.items():
    # Explicit encoding, so that the result does not depend on the platform's default
    # (the files are presumably the UTF-8 Project Gutenberg downloads -- adjust if not)
    with open( f"data/pg{book_id}.txt", encoding = "utf-8" ) as f:
        raw_texts[ title ] = f.read()

# Drop the licence header/footer, then split each book into a list of sentences
texts = { title: remove_legalese( text ) for title, text in raw_texts.items() }
sentences = {
    title: nltk.sent_tokenize( text )
    for title, text in texts.items()
}

In [20]:
# One data frame per novel: one row per sentence (indexed by the sentence itself),
# one column per embedding dimension.
# Encoding takes 3 minutes locally, much more (30?) on Google Colab
embeddings = {
    title: pd.DataFrame( model.encode( novel ), index = novel )
    for title, novel in sentences.items()
}

In [None]:
## INSERT YOUR CODE HERE