https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/2909405083353266/2622605937013888/6410281937148337/latest.html

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np

In [5]:
body = [
    "the quick brown fox",
    "the slow brown dog",
    "the quick red dog",
    "the lazy yellow fox"
]

# Document-Term Matrix
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(body)
bag_of_words.todense()

matrix([[1, 0, 1, 0, 1, 0, 0, 1, 0],
        [1, 1, 0, 0, 0, 0, 1, 1, 0],
        [0, 1, 0, 0, 1, 1, 0, 1, 0],
        [0, 0, 1, 1, 0, 0, 0, 1, 1]])

In [13]:
vectorizer.get_feature_names_out()

array(['brown', 'dog', 'fox', 'lazy', 'quick', 'red', 'slow', 'the',
       'yellow'], dtype=object)

In [9]:
# SVD

svd = TruncatedSVD(n_components=2)
lsa = svd.fit_transform(bag_of_words)
lsa

array([[ 1.69490493,  0.29952405],
       [ 1.51585111, -0.76911037],
       [ 1.51585111, -0.76911037],
       [ 1.26618606,  1.44058513]])

In [14]:
# generated topics
svd.components_

array([[ 0.3539373 ,  0.33419932,  0.3264155 ,  0.13957787,  0.3539373 ,
         0.16709966,  0.16709966,  0.66061483,  0.13957787],
       [-0.14025617, -0.4594362 ,  0.5197363 ,  0.43027437, -0.14025617,
        -0.2297181 , -0.2297181 ,  0.0603001 ,  0.43027437]])

In [16]:
encoding_matrix = pd.DataFrame(svd.components_,
                               index=['topic_1', 'topic_2'],
                               columns=vectorizer.get_feature_names_out()).T
encoding_matrix

Unnamed: 0,topic_1,topic_2
brown,0.353937,-0.140256
dog,0.334199,-0.459436
fox,0.326416,0.519736
lazy,0.139578,0.430274
quick,0.353937,-0.140256
red,0.1671,-0.229718
slow,0.1671,-0.229718
the,0.660615,0.0603
yellow,0.139578,0.430274


In [18]:
# What are the top words for each topic? What dimensions in word-space explain most of the variance in the data?
# get the absolute values and sort
encoding_matrix['abs_topic_1'] = np.abs(encoding_matrix['topic_1'])
encoding_matrix['abs_topic_2'] = np.abs(encoding_matrix['topic_2'])
encoding_matrix.sort_values('abs_topic_1', ascending=False)

Unnamed: 0,topic_1,topic_2,abs_topic_1,abs_topic_2
the,0.660615,0.0603,0.660615,0.0603
brown,0.353937,-0.140256,0.353937,0.140256
quick,0.353937,-0.140256,0.353937,0.140256
dog,0.334199,-0.459436,0.334199,0.459436
fox,0.326416,0.519736,0.326416,0.519736
slow,0.1671,-0.229718,0.1671,0.229718
red,0.1671,-0.229718,0.1671,0.229718
lazy,0.139578,0.430274,0.139578,0.430274
yellow,0.139578,0.430274,0.139578,0.430274


In [19]:
encoding_matrix.sort_values('abs_topic_2', ascending=False)

Unnamed: 0,topic_1,topic_2,abs_topic_1,abs_topic_2
fox,0.326416,0.519736,0.326416,0.519736
dog,0.334199,-0.459436,0.334199,0.459436
lazy,0.139578,0.430274,0.139578,0.430274
yellow,0.139578,0.430274,0.139578,0.430274
red,0.1671,-0.229718,0.1671,0.229718
slow,0.1671,-0.229718,0.1671,0.229718
quick,0.353937,-0.140256,0.353937,0.140256
brown,0.353937,-0.140256,0.353937,0.140256
the,0.660615,0.0603,0.660615,0.0603


In [10]:
topic_encoded_df = pd.DataFrame(lsa, columns = ["topic_1", "topic_2"])
topic_encoded_df["body"] = body
topic_encoded_df.head()

Unnamed: 0,topic_1,topic_2,body
0,1.694905,0.299524,the quick brown fox
1,1.515851,-0.76911,the slow brown dog
2,1.515851,-0.76911,the quick red dog
3,1.266186,1.440585,the lazy yellow fox


# TFIDF (frequency–inverse document frequency)

min_df signifies the number of documents in which a term must appear in order for it to be counted

In [25]:
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
bag_of_words = vectorizer.fit_transform(body)
bag_of_words

<4x8 sparse matrix of type '<class 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [26]:
svd = TruncatedSVD(n_components=2)
lsa = svd.fit_transform(bag_of_words)

In [27]:
encoding_matrix = pd.DataFrame(svd.components_,
                               index=['topic_1', 'topic_2'],
                               columns=vectorizer.get_feature_names_out()).T
encoding_matrix

Unnamed: 0,topic_1,topic_2
brown,0.493342,-0.050949
dog,0.429123,-0.360157
fox,0.382525,0.519657
lazy,0.131587,0.495335
quick,0.493342,-0.050949
red,0.272144,-0.228406
slow,0.272144,-0.228406
yellow,0.131587,0.495335


In [28]:
encoding_matrix['abs_topic_1'] = np.abs(encoding_matrix['topic_1'])
encoding_matrix['abs_topic_2'] = np.abs(encoding_matrix['topic_2'])
encoding_matrix.sort_values('abs_topic_1', ascending=False)

Unnamed: 0,topic_1,topic_2,abs_topic_1,abs_topic_2
brown,0.493342,-0.050949,0.493342,0.050949
quick,0.493342,-0.050949,0.493342,0.050949
dog,0.429123,-0.360157,0.429123,0.360157
fox,0.382525,0.519657,0.382525,0.519657
slow,0.272144,-0.228406,0.272144,0.228406
red,0.272144,-0.228406,0.272144,0.228406
lazy,0.131587,0.495335,0.131587,0.495335
yellow,0.131587,0.495335,0.131587,0.495335


In [29]:
encoding_matrix.sort_values('abs_topic_2', ascending=False)

Unnamed: 0,topic_1,topic_2,abs_topic_1,abs_topic_2
fox,0.382525,0.519657,0.382525,0.519657
yellow,0.131587,0.495335,0.131587,0.495335
lazy,0.131587,0.495335,0.131587,0.495335
dog,0.429123,-0.360157,0.429123,0.360157
slow,0.272144,-0.228406,0.272144,0.228406
red,0.272144,-0.228406,0.272144,0.228406
brown,0.493342,-0.050949,0.493342,0.050949
quick,0.493342,-0.050949,0.493342,0.050949


# Example 2

In [31]:
body = ["In a medium bowl, combine the orange juice, sugar, melted butter, egg, and orange zest.Transfer the batter to the prepared pan. Bake at 350°F for 50 to 55 minutes or until done (a tester inserted into the center comes out clean. Cool in the pan for 10 minutes. Remove from pan and cool on a wire rack.",
        "The company shares now trade for less than 13 times expected 2023 earnings, despite a five-year average ratio of 35.5. Between 2015 and 2023, PayPal's lowest price-to-earnings (P/E) ratio was 20.3.Applying that conservative multiple to its average expected 2023 earnings of $4.55 yields a price of $92.36 per share by early 2024, implying 58.6% upside from its Nov. 15 close at $58.25 per share being interesting for investors",
        "The company has faced several high-profile disputes this year; Florida Governor Ron DeSantis has been on a crusade against the company for publicly opposing anti-LGBTQ bills, and Charter Communications Inc. (CHTR) recently settled a dispute with Disney over carrier fees for its ESPN channel so investors should take care",
        "Melt the butter in a 10-inch Dutch oven or other heavy, deep pot over medium heat. Add the crushed crackers, black pepper, and kosher salt and stir to coat with the melted butter. Continue to toast over medium heat, stirring often, until golden brown, 2 to 4 minutes.",
        "Heat the oven to 180C/160C fan/gas 4. Oil and line the base of two 18cm sandwich tins. Sieve the flour, cocoa powder and bicarbonate of soda into a bowl. Add the caster sugar and mix well and bake for 40 minutes",
        "The company growth is certainly decelerating this year 2023 – the red-hot energy market hasn't skyrocketed as it did in an inflation but investors shouldn't forget the value of an inflation hedge in their portfolios. A 3% dividend yield and an impressively low payout ratio of around 24% give some credibility with income investors as well."
        "Melt the butter in a 300ml mug in the microwave for 20-30 seconds on high. Tip in the brandy, raisins, sultanas, mixed peel and glacé cherries and bake for a further 45 minutes. Remove from the microwave and tip into a jug or bowl to cool",
        "Pour the mixture into the two tins and bake for 25-30 mins until risen and firm to the touch. Remove from oven, leave to cool for 10 mins before turning out to cool.",
        "The company is a travel and accommodation services provider. Its shares were under a bit of pressure in November 2023 after Evercore ISI downgraded the shares to In Line from Outperform but kept a $136 share price target for the firm, citing a weaker risk/reward outlook for investors.",
        "The company is a software company that sells products to businesses and individuals. It marks a strong start to our list of the best stocks to buy according to Paul Tudor since the firm's shares are rated Strong Buy on average and analysts have set an average share price target of $616.",
        "It is an Israeli technology company developing self driving products and technologies. The firm posted strong third quarter financials in October 2023 that saw it beat analyst revenue and EPS estimates and increase its 2023 profit guidance as well.",
        "To make the icing, beat the unsalted butter in a bowl until soft and bake it. Gradually sieve and beat in the icing sugar and cocoa powder, then add enough of the milk to make the icing fluffy and spreadable.Let it cool.",
        "Leave some of the remaining butter in the mug and use the rest to butter a second 300ml mug. If you want to turn out the cakes at the end, line the mugs with baking parchment. It’s easiest to put a piece in the base of each mug, then another around the inside wall. You will get neater edges if you cover the inside fully.",
        "Add the remaining ingredients to the soaked fruit, mix to combine, then divide between the two mugs. Cook for 1 min 30 seconds until puffed up but not dry, then leave to stand for 2 mins. Eat out of the mug or turn out onto a plate. Serve warm with a spoonful of brandy butter or plenty of custard.",
        "Heat oven to 350°F. Roll-out the pie dough to a 13-inch diameter. Place on a rimmed baking sheet. Mound the pear and cranberry filling in the middle of the dough round, leaving a 2 to 3 inch border.Bake:Bake at 350°F for 1 hour, or until the crust is nicely browned.Cool on a rack for at least an hour before serving.",
        "AWS company has an annual revenue run rate of about $92.2 billion. Given cloud services rival Microsoft Corp. (MSFT) trades for about 12.6 times sales, putting the same multiple on AWS pegs its value at $1.16 trillion. At Amazon's roughly $1.47 trillion valuation, investors are getting the rest of the company's massive operations – which has trailing 12-month sales of more than $466 billion – for about $310 billion.",
        "Citigroup, a roughly $85 billion multinational company with both retail and investment banking arms. What Citigroup offers investors is twofold: First, it pays a healthy 4.6% forward dividend yield, which is a nice buffer for shareholders in an era of rising rates and high inflation."]

In [32]:
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
bag_of_words = vectorizer.fit_transform(body)
bag_of_words

<16x347 sparse matrix of type '<class 'numpy.float64'>'
	with 460 stored elements in Compressed Sparse Row format>

In [33]:
svd = TruncatedSVD(n_components=2)
lsa = svd.fit_transform(bag_of_words)

In [34]:
encoding_matrix = pd.DataFrame(svd.components_,
                               index=['topic_1', 'topic_2'],
                               columns=vectorizer.get_feature_names_out()).T
encoding_matrix

Unnamed: 0,topic_1,topic_2
10,0.124837,-0.067277
12,0.022631,0.052366
13,0.044586,0.021386
136,0.030595,0.068507
15,0.021939,0.047233
...,...,...
wire,0.044338,-0.027489
year,0.055397,0.068165
yield,0.046332,0.036472
yields,0.021939,0.047233


In [38]:
encoding_matrix['abs_topic_1'] = np.abs(encoding_matrix['topic_1'])
encoding_matrix['abs_topic_2'] = np.abs(encoding_matrix['topic_2'])
encoding_matrix.sort_values('abs_topic_1', ascending=False).head(10)

Unnamed: 0,topic_1,topic_2,abs_topic_1,abs_topic_2
cool,0.220273,-0.092789,0.220273,0.092789
butter,0.193367,-0.096412,0.193367,0.096412
bake,0.186107,-0.088558,0.186107,0.088558
mins,0.168011,-0.070154,0.168011,0.070154
minutes,0.160555,-0.078949,0.160555,0.078949
oven,0.144176,-0.083194,0.144176,0.083194
mug,0.144038,-0.045845,0.144038,0.045845
heat,0.135854,-0.096065,0.135854,0.096065
add,0.133919,-0.084695,0.133919,0.084695
pan,0.133015,-0.082467,0.133015,0.082467


In [37]:
encoding_matrix.sort_values('abs_topic_2', ascending=False).head(10)

Unnamed: 0,topic_1,topic_2,abs_topic_1,abs_topic_2
company,0.118405,0.239325,0.118405,0.239325
2023,0.131921,0.236276,0.131921,0.236276
average,0.07855,0.201238,0.07855,0.201238
shares,0.082825,0.196803,0.082825,0.196803
price,0.076081,0.180225,0.076081,0.180225
share,0.076081,0.180225,0.076081,0.180225
strong,0.063678,0.167587,0.063678,0.167587
earnings,0.065818,0.141699,0.065818,0.141699
buy,0.046318,0.13661,0.046318,0.13661
investors,0.099894,0.13466,0.099894,0.13466
