In [1]:
import numpy as np


def stylistic_features(text):
    """
    Extracts stylistic features from a given text, including text length.

    Parameters:
    - text (str): The input text from which stylistic features are extracted.

    Returns:
    - dict: A dictionary containing the sentence count, average word length, and text length.
    """
    text_length = len(text)
    sentences = text.split(".")
    sentence_count = len(sentences) - 1
    word_lengths = [len(word) for word in text.split()]
    average_word_length = sum(word_lengths) / len(word_lengths) if word_lengths else 0

    return {
        "text_length": text_length,
        "sentence_count": sentence_count,
        "average_word_length": average_word_length,
    }


def stylistic_similarity(
    text1,
    text2,
    features_to_compare=None,
):
    """
    Calculates the stylistic similarity between two texts based on customizable features, including consideration of text length.

    Parameters:
    - text1 (str): The first text.
    - text2 (str): The second text.
    - features_to_compare (list, optional): A list of features to compare for similarity. Options include 'sentence_count', 'average_word_length', and 'text_length'. Defaults to comparing all features.

    Returns:
    - float: The overall stylistic similarity between the two texts based on the selected features.
    """
    if features_to_compare is None:
        features_to_compare = ["text_length", "sentence_count", "average_word_length"]

    features1 = stylistic_features(text1)
    features2 = stylistic_features(text2)
    similarities = []

    for feature in features_to_compare:
        if feature in features1 and feature in features2:
            similarity = 1 - abs(features1[feature] - features2[feature]) / max(features1[feature], features2[feature], 1)
            similarities.append(similarity)

    if similarities:
        average_similarity = np.mean(similarities)
    else:
        average_similarity = 0

    return average_similarity


def extract_structural_features(text):
    """
    Extracts structural features from a given text, including headers, bullet points, and numbered lists.

    Parameters:
    - text (str): The input text from which structural features are extracted.

    Returns:
    - dict: A dictionary containing the header count, bullet points count, and numbered list count.
    """
    features = {}
    # Count headers more robustly by considering all levels of markdown headers
    features["header_count"] = sum(1 for line in text.split("\n") if line.strip().startswith("#"))
    features["bullet_point_count"] = text.count("\n- ") + text.count("\n* ")

    # Enhanced handling for numbered lists including first and second tier (e.g., 1., a.)
    def is_numbered_list_item(line):
        parts = line.strip().split(". ", 1)
        if len(parts) == 2:
            first_part, _ = parts
            return first_part.isdigit() or (first_part.isalpha() and len(first_part) == 1)
        return False

    features["numbered_list_count"] = sum(1 for line in text.split("\n") if is_numbered_list_item(line))

    return features


def structural_similarity(text1, text2):
    """
    Calculates the structural similarity between two texts, considering headers, bullet points, and numbered lists.

    Parameters:
    - text1 (str): The first text.
    - text2 (str): The second text.

    Returns:
    - float: The overall structural similarity between the two texts.
    """
    features1 = extract_structural_features(text1)
    features2 = extract_structural_features(text2)

    header_similarity = 1 - abs(features1["header_count"] - features2["header_count"]) / max(features1["header_count"], features2["header_count"], 1)
    bullet_similarity = 1 - abs(features1["bullet_point_count"] - features2["bullet_point_count"]) / max(
        features1["bullet_point_count"], features2["bullet_point_count"], 1
    )
    numbered_list_similarity = 1 - abs(features1["numbered_list_count"] - features2["numbered_list_count"]) / max(
        features1["numbered_list_count"], features2["numbered_list_count"], 1
    )

    average_similarity = (header_similarity + bullet_similarity + numbered_list_similarity) / 3

    return average_similarity


def format_similarity(text1, text2):
    """
    Calculates a comprehensive format similarity between two texts, combining stylistic and structural scores, including the consideration of text length.

    Parameters:
    - text1 (str): The first text.
    - text2 (str): The second text.

    Returns:
    - float: The overall format similarity between the two texts.
    """
    features_to_compare = ["text_length", "sentence_count", "average_word_length"]
    stylistic_score = stylistic_similarity(text1, text2, features_to_compare[:])
    structural_score = structural_similarity(text1, text2)

    overall_score = stylistic_score * 0.3 + structural_score * 0.7
    return overall_score

In [2]:
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine


def BERTsimilarity(text1, text2):
    """
    Calculates the similarity between two texts using BERT embeddings.

    Parameters:
    - text1 (str): The first text.
    - text2 (str): The second text.

    Returns:
    - float: The similarity between the two texts.
    """
    model = SentenceTransformer("all-mpnet-base-v2")
    embedding1 = model.encode(text1)
    embedding2 = model.encode(text2)
    similarity = 1 - cosine(embedding1, embedding2)
    return similarity

In [3]:
def compare_texts(text1, text2):
    features_to_compare = ["text_length", "sentence_count", "average_word_length", "punctuation_count"]
    print(f"Stylistic similarity: {stylistic_similarity(text1, text2, features_to_compare[:2])}")
    print(f"Structural similarity: {structural_similarity(text1, text2)}")
    print(f"Format similarity: {format_similarity(text1, text2)}")
    print(f"Bert similarity: {BERTsimilarity(text1, text2)}")

In [4]:
text1 = "The bottle is empty."
text2 = "There is nothing in the bottle."

compare_texts(text1, text2)

Stylistic similarity: 0.8225806451612903
Structural similarity: 1.0
Format similarity: 0.9625930521091811
Bert similarity: 0.8154221773147583


In [5]:
text1 = """
Sure, I'd love to share some useful information and tips about AI (Artificial Intelligence).

1. **Understanding AI**
   AI is a branch of computer science that aims to create systems capable of performing tasks that would require intelligence if done by humans. These tasks include learning, reasoning, problem-solving, perception, and language understanding.

2. **Types of AI**
   There are two types of AI: Narrow AI, which is designed to perform a narrow task (e.g. facial recognition or voice commands), and General AI, which can perform any intellectual task that a human being can do.

3. **Machine Learning**
   Machine Learning is a subset of AI, where the machine learns and improves from experience without being explicitly programmed. It's based on the idea that systems can learn from data, identify patterns, and make decisions with minimal human intervention.

4. **Neural Networks**
   Neural networks are a set of algorithms designed to recognize patterns. They interpret sensory data through a kind of machine perception, labeling or clustering raw input.

5. **Data is Key**
   The quality of the data used in AI is extremely important. The AI system will only be as good as the data it is trained on. Bias in data can lead to bias in AI systems.

6. **AI in Business**
   AI can be used in numerous business applications, like customer service (chatbots), talent recruiting, sales and marketing optimization, predictive maintenance, and many more.

7. **Ethical Considerations**
   As AI systems become more prevalent, ethical considerations come into play. These concerns cover privacy, job replacement by AI, and creating AI systems that behave in a way that is beneficial to humanity.

**Tips:**

1. **Stay Updated**: AI is a rapidly changing field. Keep learning and staying updated on the latest advancements.

2. **Practical Experience**: Try to get hands-on experience with AI. There are many online platforms like Coursera, Edx, and Udacity where you can learn and practice.

3. **Understand Data**: AI depends heavily on data. Understanding how to collect, clean, and interpret data is crucial.

4. **Programming Skills**: If you are interested in creating your own AI models, learning programming languages like Python, R, and Java is useful.

5. **Mathematics**: Knowledge in algebra, calculus, and statistics is essential to understand algorithms used in AI.

6. **Ethics**: Understand the ethical implications of AI and strive to use AI responsibly.

Here are a few links for further reading:

- AI basics: https://futureoflife.org/background/benefits-risks-of-artificial-intelligence/
- AI in Business: https://www.forbes.com/sites/forbestechcouncil/2020/05/04/the-role-of-artificial-intelligence-in-business/
- Ethical implications of AI: https://www.brookings.edu/research/why-ethical-ai-matters/
- AI online courses:
    - Coursera: https://www.coursera.org/courses?query=artificial%20intelligence
    - Edx: https://www.edx.org/course/subject/artificial-intelligence
    - Udacity: https://www.udacity.com/course/intro-to-artificial-intelligence--cs271
"""

text2 = """
AI (Artificial Intelligence) refers to the development of computer systems that can perform tasks that would typically require human intelligence. Here are some useful information and tips about AI, broken down into key aspects:

1. Types of AI:
   a. Narrow AI: Designed to perform specific tasks (e.g., virtual assistants, speech recognition, image recognition)
   b. General AI: Theoretical and not yet existent, it would possess the ability to perform any intellectual task that a human can do.

2. AI technologies and techniques:
   a. Machine Learning (ML): A subset of AI that allows computers to learn and improve from experience without explicit programming.
   b. Deep Learning: A subset of ML that uses artificial neural networks to model and solve complex problems.
   c. Natural Language Processing (NLP): Enables computers to interpret, understand, and generate human language.
   d. Computer Vision: Deals with the acquisition, analysis, and understanding of images and videos.

3. AI use cases:
   a. Healthcare: Diagnostics, personalized medicine, drug discovery.
   b. Finance: Fraud detection, risk management, trading algorithms.
   c. Retail: Personalized recommendations, inventory management, chatbots.
   d. Manufacturing: Predictive maintenance, quality control, robotics.
   e. Transportation: Autonomous vehicles, traffic management, route optimization.

4. Tips for implementing AI:
   a. Define clear objectives: Understand the specific problems you want AI to solve and set measurable goals.
   b. Collect and manage high-quality data: AI algorithms rely on accurate and representative data to perform well.
   c. Choose the right algorithms and tools: Experiment with different ML algorithms and tools to find the best fit for your needs.
   d. Collaborate with experts: Work with AI and domain experts to ensure your AI project is on the right track.
   e. Monitor and update models: Continuously evaluate AI models' performance and update them to maintain accuracy and relevance.

5. Ethical considerations:
   a. Bias and fairness: Be aware of potential biases in data and algorithms, and strive for fairness in AI applications.
   b. Transparency and explainability: Ensure AI models can be understood and explained to stakeholders.
   c. Privacy and security: Protect user data and comply with data protection regulations.
   d. Accountability: Establish clear responsibilities for AI systems and their outcomes.

6. Popular AI platforms and frameworks:
   a. TensorFlow (https://www.tensorflow.org/)
   b. Keras (https://keras.io/)
   c. PyTorch (https://pytorch.org/)
   d. scikit-learn (https://scikit-learn.org/)

By understanding these aspects of AI, you can better navigate the world of artificial intelligence and make informed decisions when implementing AI solutions.
"""

compare_texts(text1, text2)

Stylistic similarity: 0.8773119502324492
Structural similarity: 0.4777777777777778
Format similarity: 0.6058999793067013
Bert similarity: 0.8462386727333069


In [6]:
text1 = """
Gradient descent is a first-order iterative optimization algorithm for finding the minimum of a function. It's often used in machine learning and data science to find the parameters that minimize a particular cost function. 
 
Here's a more detailed explanation: 
 
To start with, you have a cost or loss function, which measures how well your algorithm performs. This function depends on your algorithm's parameters. You want to adjust the parameters to minimize the cost function. But the function is complex and it's not clear how to adjust the parameters to make the cost lower. 
 
The idea behind gradient descent is to look at the slope of the cost function at your current point. The slope, or gradient, points in the direction of the steepest increase in the function. So if you adjust your parameters in the opposite direction to the gradient, you'll move towards a minimum of the function. 
 
You iteratively adjust your parameters in the direction of steepest descent, until you hopefully reach the global minimum of the function. There's a parameter called the learning rate which controls how big a step you take at each iteration. 
 
One challenge with gradient descent is that it can get stuck in local minima - points where the function is lower than all nearby points, but not lower than some other point further away. There are various versions of gradient descent (like stochastic gradient descent or mini-batch gradient descent) that try to overcome this issue. 
"""

text2 = """
Gradient descent is an optimization algorithm used in machine learning and deep learning to minimize a loss function by iteratively moving towards the minimum value. It is a popular method for training artificial neural networks, and is particularly useful for solving problems where the objective function has multiple local minima. 
 
The main idea behind gradient descent is to determine the direction in which the function decreases the steepest at each point in the parameter space, and then adjust the parameters (such as weights and biases in a neural network) accordingly. This is done by calculating the gradient of the loss function with respect to each parameter. The gradient is a vector of partial derivatives that points in the direction of the greatest increase of the function. By taking steps in the opposite direction of the gradient, we aim to move towards the minimum of the function. 
 
There are three main types of gradient descent: 
 
1. Batch gradient descent: This method computes the gradient using the entire dataset, and updates the parameters accordingly. This can be computationally expensive, especially for large datasets. 
 
2. Stochastic gradient descent (SGD): This method computes the gradient and updates the parameters using a single training example at a time. This can be faster than batch gradient descent, but may result in a more erratic path towards the minimum due to the noisy nature of the individual training examples. 
 
3. Mini-batch gradient descent: This method is a compromise between batch gradient descent and SGD. It computes the gradient and updates the parameters using a subset (mini-batch) of the training examples. This can provide a balance between computational efficiency and convergence stability. 
 
Gradient descent has a few hyperparameters, such as the learning rate, which determines the step size in the parameter updates. Choosing the right learning rate is crucial for convergence to the global minimum. If the learning rate is too small, the algorithm will take too long to converge; if it's too large, the algorithm may overshoot the minimum and fail to converge. 
 
In summary, gradient descent is an optimization algorithm used to minimize a loss function by iteratively adjusting the parameters in the direction of the steepest decrease, allowing for the training of machine learning models like artificial neural networks. 
"""

compare_texts(text1, text2)

Stylistic similarity: 0.6335239085239086
Structural similarity: 0.6666666666666666
Format similarity: 0.6876534114283117
Bert similarity: 0.8388808965682983


In [7]:
text1 = """
### Introduction

In the realm of economics, understanding individuals' attitudes towards risk is crucial for explaining various economic behaviors, especially in the context of investment, consumption, and insurance decisions. The concept of risk aversion and its relationship with the utility of income plays a pivotal role in this understanding. I will discuss whether an individual being risk-averse is synonymous with having diminishing marginal utility of income.

### Key Concepts

- **Risk Aversion**: A risk-averse individual prefers to avoid uncertainty, choosing a certain outcome over a gamble with a potentially higher expected value. This trait is quantitatively measured by the curvature of the utility function: a concave utility function indicates risk aversion.
- **Utility**: Utility represents satisfaction or happiness that a consumer obtains from consumption of goods and services. The utility function maps levels of wealth or income to a real number indicating levels of utility.
- **Marginal Utility of Income**: This is the additional satisfaction or utility a consumer gains from receiving an additional unit of income. It is represented by the first derivative of the utility function with respect to income.

### Proofs

#### Proposition: An individual is risk averse if and only if he/she has diminishing marginal utility of income.

- **Forward Direction (If an individual is risk-averse, then he/she has diminishing marginal utility of income)**: To demonstrate this, consider a utility function \(U(I)\), where \(I\) represents income. Risk aversion implies that \(U(I)\) is concave. Mathematically, a function is concave if its second derivative is negative, i.e., \(U''(I) < 0\). The negative second derivative indicates that the slope of the utility function (the marginal utility of income) is decreasing in income, which by definition means diminishing marginal utility of income.

- **Reverse Direction (If an individual has diminishing marginal utility of income, then he/she is risk-averse)**: Assuming that the marginal utility of income diminishes as income increases (\(U''(I) < 0\)), it follows that the utility function is concave. A concave utility function means that the individual prefers a certain outcome over a gamble with the same expected income, which is the definition of risk aversion.

### Examples

Consider an individual with a utility function \(U(I) = \sqrt{I}\), where \(I\) is income.

- To check for concavity, we derive the first and second derivatives of \(U(I)\):
  - The first derivative \(U'(I) = \frac{1}{2\sqrt{I}}\), which is positive and decreases as \(I\) increases, indicating that marginal utility diminishes with income.
  - The second derivative \(U''(I) = -\frac{1}{4I\sqrt{I}}\), which is negative, indicating that \(U(I)\) is concave, and thus the individual is risk-averse.

This utility function exhibits both diminishing marginal utility of income and risk aversion, consistent with our proposition.

### Conclusion

The proposition that an individual is risk averse if and only if he/she has diminishing marginal utility of income holds true under the standard assumptions of utility theory. The diminishing marginal utility of income, represented by a concave utility function, inherently captures the essence of risk aversion. This relationship is foundational in economics, helping to explain why individuals make certain financial decisions, including savings, insurance, and investment choices, based on their levels of income and attitudes towards risk.
"""

text2 = """
### Introduction

The relationship between risk aversion and diminishing marginal utility of income is a fundamental concept in both economics and decision theory. To explore whether an individual is risk averse if and only if they have diminishing marginal utility of income, we need to delve into the definitions and theories underpinning these concepts.

### Key Concepts

**Risk Aversion**: A risk-averse individual prefers a certain outcome over a gamble with an equivalent expected value. This preference indicates a concave utility function, where the utility gained from an incremental increase in wealth decreases as wealth increases.

**Marginal Utility of Income**: This refers to the additional satisfaction (utility) a person gains from receiving an additional unit of income. The concept of diminishing marginal utility suggests that as income increases, the additional satisfaction gained from an extra unit of income decreases.

**Utility Function**: A mathematical representation of an individual's preference. For risk-averse individuals, the utility function is concave due to diminishing marginal utility.

### Proofs

**Forward Direction (If an individual is risk averse, then they have diminishing marginal utility of income):**

- Given a risk-averse individual, their utility function \(U(W)\) is concave, where \(W\) is wealth.
- For a utility function to be concave, its second derivative must be negative \(U''(W) < 0\).
- A negative second derivative implies that the slope (or the marginal utility) of the utility function is decreasing as wealth increases.
- Hence, if an individual is risk averse (preferring certainty over equivalent risk), this implies that the individual has diminishing marginal utility of income.

**Reverse Direction (If an individual has diminishing marginal utility of income, then they are risk averse):**

- Assume an individual has a utility function \(U(W)\) with diminishing marginal utility, implying \(U''(W) < 0\).
- Consider a choice between a certain income \(W_c\) and a gamble with expected wealth \(E[W_g] = W_c\).
- Due to diminishing marginal utility, the utility of the expected wealth of the gamble \(U(E[W_g])\) is less than the expected utility of the gamble \(E[U(W_g)]\), because the concavity of \(U(W)\) means that the utility of average wealth is less than the average utility across different states of wealth.
- Therefore, the individual prefers the certain outcome \(W_c\) over the gamble, indicating risk aversion.

### Examples

**Example 1: Insurance Purchase**

- Assume an individual can either face a loss with some probability or pay an insurance premium to avoid the loss.
- A risk-averse individual, due to diminishing marginal utility of income, is willing to pay a premium (which is more than the expected loss) for insurance to avoid the risk, showing a preference for certainty.

**Example 2: Investment Choices**

- Consider two investment options: a risk-free government bond with a certain return and a risky stock with higher potential returns but also potential losses.
- An individual with diminishing marginal utility of income would prefer the bond over the stock, even if the expected returns are the same, due to the preference for certainty over risk.

### Conclusion

The propositions that an individual is risk averse if and only if they have diminishing marginal utility of income are supported both theoretically and through practical examples. The concavity of the utility function, indicating both risk aversion and diminishing marginal utility, serves as the crucial link between these concepts. Thus, these characteristics are inherently connected, providing a foundational principle in understanding economic behavior towards risk and income.
"""

compare_texts(text1, text2)

Stylistic similarity: 0.9315425531914894
Structural similarity: 0.7777777777777778
Format similarity: 0.8279582013923628
Bert similarity: 0.9533283710479736


In [8]:
text1 = """
### Sagittarius A*

Imagine standing in the heart of a bustling city, where every road and every pathway converges into a central square. This vivid scene is somewhat analogous to Sagittarius A* (pronounced "Sagittarius A-Star"), the supermassive black hole residing at the Milky Way's core. It's like the downtown of our galaxy, where an immense gravitational pull attracts stars, gas, and cosmic dust, orchestrating a cosmic ballet around this invisible yet profound entity.

Sagittarius A* is a colossus, harboring about 4 million times the mass of our Sun. To put that into perspective, if the Sun were a single grain of sand, Sagittarius A* would be a large boulder weighing as much as 4 million grains of sand! Despite its mass, it's relatively compact, fitting within a region much smaller than our solar system.

Located approximately 26,000 light-years from Earth, Sagittarius A* is relatively close in cosmic terms. A light-year is the distance light travels in a year, about 9.46 trillion kilometers (or about 5.88 trillion miles). So, when we peer at the center of our galaxy, we're looking back in time 26,000 years, witnessing the echoes of a distant past.

### TON 618

Now, let's journey far beyond the Milky Way to encounter TON 618, one of the most massive black holes known to humanity. This celestial titan is like a cosmic giant among ants when compared to our local supermassive black hole, Sagittarius A*.

TON 618 is a quasar, an extraordinarily bright and distant celestial object powered by a supermassive black hole surrounded by a disk of gas and dust that spirals into it, releasing incredible amounts of energy. The black hole at the heart of TON 618 is a staggering 66 billion times the mass of the Sun. Imagine if each grain of sand on a beach represented a sun; TON 618 would then be equivalent to a mountain of sand, each grain a sun in its own right.

Located about 10.4 billion light-years from Earth, TON 618 shines from the dawn of the universe, its light taking over 10 billion years to reach us. Observing TON 618 is like looking through a time machine, seeing the universe as it was when it was less than a third of its current age.

The enormity of TON 618 and its quasar's brilliance illuminate the depths of our universe, showcasing the incredible diversity and scale of cosmic phenomena. From the bustling streets of our galactic center, Sagittarius A*, to the distant and ancient light of TON 618, the universe invites us to marvel at its wonders, both near and far.

Through these comparisons, we bridge the gap between the vast, incomprehensible scale of the cosmos and our everyday experiences, making the wonders of the universe more accessible and truly awe-inspiring.
"""

text2 = """
### Sagittarius A*

Imagine standing at the heart of our Milky Way galaxy. Here, amidst a dazzling sea of stars, lies a cosmic behemoth—a supermassive black hole known as Sagittarius A* (pronounced "Sagittarius A-star"). This celestial giant is invisible in the regular spectrum of light, but astronomers can detect its presence by observing the stars and gas clouds swirling around it, much like water circling a drain before plunging in.

Now, to grasp the sheer scale of Sagittarius A*, let's start with some numbers. It's about 26,000 light-years away from us. In more familiar terms, if you could travel at the speed of light (about 670 million mph), it would still take you 26,000 years to get there from Earth. Sagittarius A* has a mass roughly 4 million times that of our Sun. Imagine our Sun, which accounts for about 99.86% of the Solar System's mass, and then multiply that heft by 4 million. Yet, for all its mass, the event horizon (the point beyond which nothing can escape its gravitational pull) is only about 17 times wider than the Sun. If you could somehow replace the Sun in our solar system with Sagittarius A*, it would fit comfortably inside the orbit of Mercury with vast room to spare.

### TON 618

Now, let's journey beyond our galaxy to meet a true cosmic titan, TON 618. This is a quasar, the incredibly bright core of a galaxy where a supermassive black hole resides, actively devouring material and spewing out immense amounts of energy. TON 618 is one of the largest black holes ever discovered, and it's located an astonishing 10.4 billion light-years from Earth. To give you a sense of this distance: if you started traveling to TON 618 at the speed of light just after the Earth was formed, you wouldn't even be halfway there by now.

TON 618 has a mass about 66 billion times that of our Sun. To put that in perspective, it's as if you combined the mass of every star in the Milky Way, then did it again multiple times. Despite its mass, we can't directly see TON 618. What we actually observe is the quasar it powers, shining as one of the brightest objects in the universe due to the superheated material spiraling into the black hole's maw. The scale of this luminosity is hard to fathom, but imagine the Sun, then picture something billions of times brighter, and you start to get the idea.

In summary, both Sagittarius A* and TON 618 serve as awe-inspiring examples of the universe's extremes. From the relatively modest yet still mind-boggling supermassive black hole at the center of our Milky Way to the staggering, almost incomprehensible scale of TON 618, these cosmic phenomena challenge our understanding and expand our sense of the vastness that surrounds us. Through them, we glimpse not only the immense diversity of the cosmos but also the incredible scales over which it operates—reminding us of the universe's endless capacity to astonish and inspire.
"""

compare_texts(text1, text2)

Stylistic similarity: 0.905347317744154
Structural similarity: 1.0
Format similarity: 0.9792198294836278
Bert similarity: 0.8774768114089966
