In [336]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import itertools
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neural_network import MLPRegressor

## Data Generation

In [337]:


def generate_coloring(N):
    """Generate a random 2-coloring for integers from 1 to N."""
    colors = ['blue', 'red']
    return [random.choice(colors) for _ in range(N)]

def generate_sample_dataset(num_samples, N, batch_size=1000):
    """Generate a sample dataset of colorings."""
    dataset = []
    batches = num_samples // batch_size
    remainder = num_samples % batch_size
    for _ in range(batches):
        batch = [generate_coloring(N) for _ in range(batch_size)]
        dataset.extend(batch)
    if remainder:
        dataset.extend([generate_coloring(N) for _ in range(remainder)])
    return dataset


num_samples = 50000
N = 100 # Maximum integer value to be colored
sample_dataset = generate_sample_dataset(num_samples, N)

# first 5 colorings in the dataset
for i in range(10):
    print(f"Sample {i + 1}: {sample_dataset[i]}")

Sample 1: ['red', 'blue', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'red', 'red', 'blue', 'red', 'red', 'red', 'red', 'red', 'blue', 'red', 'blue', 'blue', 'red', 'blue', 'blue', 'blue', 'blue', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'blue', 'blue', 'blue', 'red', 'blue', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'blue', 'blue', 'red', 'blue', 'red', 'blue', 'blue', 'blue', 'red', 'red', 'blue', 'red', 'blue', 'red', 'red', 'red', 'red', 'blue', 'red', 'blue', 'red', 'blue', 'blue', 'red', 'blue', 'blue', 'red', 'blue', 'red', 'blue', 'blue', 'blue', 'red', 'red', 'blue', 'blue', 'red', 'red', 'blue', 'blue', 'red', 'red', 'blue', 'red', 'red', 'red', 'blue', 'red', 'blue', 'red', 'red', 'blue', 'red', 'blue', 'blue']
Sample 2: ['blue', 'blue', 'blue', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'red', 'red', 'red', 'blue', 'blue', 'blue', 'red', 'red', 'blue', 'blue', 'blue', 'red', 'red', 'blue', 'red', 'blue', 'blue', 'blue', 'red', 'blu

## Fractions of Monochromatic Arithmetic Triples

In [339]:
def monochromatic_triplets_fraction(colors):
    """Calculate the fraction of monochromatic arithmetic progressions of length 3."""
    n = len(colors)
    monochromatic_ap = 0
    total_ap = 0
    
    # Iterate over all possible triplets (i, j, k) that form an arithmetic progression
    for i in range(n):
        for j in range(i + 1, n):
            diff = j - i
            k = j + diff
            if k < n:
                #print(f"Indices: ({i}, {j}, {k}) -> Colors: ({colors[i]}, {colors[j]}, {colors[k]})")
                total_ap += 1
                if colors[i] == colors[j] == colors[k]:
                    monochromatic_ap += 1
    
    # Calculate the fraction
    if total_ap != 0:
        fraction_monochromatic_ap = monochromatic_ap / total_ap
    else:
        fraction_monochromatic_ap = 0
    
    return fraction_monochromatic_ap

# Applying the function on the sample dataset
sample_monochromatic_triples_fractions = [monochromatic_triplets_fraction(colors) for colors in sample_dataset]

# Print calculated fractions
for i in range(5):
    print(f"Sample {i + 1}: Fraction of monochromatic triplets: {sample_monochromatic_triples_fractions[i]}")


Sample 1: Fraction of monochromatic triplets: 0.24122448979591837
Sample 2: Fraction of monochromatic triplets: 0.24816326530612245
Sample 3: Fraction of monochromatic triplets: 0.2530612244897959
Sample 4: Fraction of monochromatic triplets: 0.24
Sample 5: Fraction of monochromatic triplets: 0.2563265306122449


In [340]:
df1 = pd.DataFrame({'sample_monochromatic_triples_fractions':sample_monochromatic_triples_fractions})

In [341]:
df1.head(5)

Unnamed: 0,sample_monochromatic_triples_fractions
0,0.241224
1,0.248163
2,0.253061
3,0.24
4,0.256327


In [342]:
df1.describe()

Unnamed: 0,sample_monochromatic_triples_fractions
count,50000.0
mean,0.250115
std,0.01284
min,0.22898
25%,0.241633
50%,0.246531
75%,0.255102
max,0.403265


## Fractions of Monochromatic quadruples (4,4)

In [345]:
def monochromatic_quadruples_fraction(colors):
    """Calculate the fraction of monochromatic arithmetic progressions of length 4."""
    n = len(colors)
    monochromatic_ap = 0
    total_ap = 0
    
    # Iterate over all possible quadruples (i, j, k, l) that form an arithmetic progression
    for i in range(n):
        for j in range(i + 1, n):
            diff = j - i
            k = j + diff
            l = k + diff
            if l < n:
               # print(f"Indices: ({i}, {j}, {k}, {l}) -> Colors: ({colors[i]}, {colors[j]}, {colors[k]}, {colors[l]})")
                total_ap += 1
                if colors[i] == colors[j] == colors[k] == colors[l]:
                    monochromatic_ap += 1
    
    # Calculate the fraction
    if total_ap != 0:
        fraction_monochromatic_ap = monochromatic_ap / total_ap
    else:
        fraction_monochromatic_ap = 0
    
    return fraction_monochromatic_ap

# Applying the function on the sample dataset
sample_monochromatic_quadruples_fractions = [monochromatic_quadruples_fraction(colors) for colors in sample_dataset]

# Print calculated fractions
for i in range(5):
    print(f"Sample {i + 1}: Fraction of monochromatic quadruples: {sample_monochromatic_quadruples_fractions[i]}")


Sample 1: Fraction of monochromatic quadruples: 0.1150278293135436
Sample 2: Fraction of monochromatic quadruples: 0.1280148423005566
Sample 3: Fraction of monochromatic quadruples: 0.1292517006802721
Sample 4: Fraction of monochromatic quadruples: 0.11069882498453927
Sample 5: Fraction of monochromatic quadruples: 0.12987012987012986


## Fractions of Monochromatic arithmetic quintuples (5,5)

In [347]:
def monochromatic_quintuples_fraction(colors):
    """Calculate the fraction of monochromatic arithmetic progressions of length 5."""
    n = len(colors)
    monochromatic_ap = 0
    total_ap = 0
    
    # Iterate over all possible quintuples (i, j, k, l, m) that form an arithmetic progression
    for i in range(n):
        for j in range(i + 1, n):
            diff = j - i
            k = j + diff
            l = k + diff
            m = l + diff
            if m < n:
                #print(f"Indices: ({i}, {j}, {k}, {l}, {m}) -> Colors: ({colors[i]}, {colors[j]}, {colors[k]}, {colors[l]}, {colors[m]})")
                total_ap += 1
                if colors[i] == colors[j] == colors[k] == colors[l] == colors[m]:
                    monochromatic_ap += 1
    
    # Calculate the fraction
    if total_ap != 0:
        fraction_monochromatic_ap = monochromatic_ap / total_ap
    else:
        fraction_monochromatic_ap = 0
    
    return fraction_monochromatic_ap

# Sample dataset
# sample_dataset = [
#     ['red', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'blue'],
#     ['red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red','blue'],
#     ['blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue'],
#     ['red', 'red', 'blue', 'blue', 'red', 'red', 'blue', 'blue', 'red', 'red'],
#     ['red', 'blue', 'blue', 'red', 'blue', 'red', 'red', 'blue', 'blue', 'red']
# ]

# Applying the function on the sample dataset
sample_monochromatic_quintuples_fractions = [monochromatic_quintuples_fraction(colors) for colors in sample_dataset]

# Print calculated fractions
for i in range(5):
    print(f"Sample {i + 1}: Fraction of monochromatic quintuples: {sample_monochromatic_quintuples_fractions[i]}")


Sample 1: Fraction of monochromatic quintuples: 0.055
Sample 2: Fraction of monochromatic quintuples: 0.06166666666666667
Sample 3: Fraction of monochromatic quintuples: 0.06666666666666667
Sample 4: Fraction of monochromatic quintuples: 0.04666666666666667
Sample 5: Fraction of monochromatic quintuples: 0.06583333333333333


## Monochromatic Sextuple (6,6)

In [350]:
def monochromatic_sextuples_fraction(colors):
    """Calculate the fraction of monochromatic arithmetic progressions of length 6."""
    n = len(colors)
    monochromatic_ap = 0
    total_ap = 0
    
    # Iterate over all possible sextuples (i, j, k, l, m, n) that form an arithmetic progression
    for i in range(n):
        for j in range(i + 1, n):
            diff = j - i
            k = j + diff
            l = k + diff
            m = l + diff
            o = m + diff
            if o < n:
                # print(f"Indices: ({i}, {j}, {k}, {l}, {m}, {o}) -> Colors: ({colors[i]}, {colors[j]}, {colors[k]}, {colors[l]}, {colors[m]}, {colors[o]})")
                total_ap += 1
                if colors[i] == colors[j] == colors[k] == colors[l] == colors[m] == colors[o]:
                    monochromatic_ap += 1
    
    # Calculate the fraction
    if total_ap != 0:
        fraction_monochromatic_ap = monochromatic_ap / total_ap
    else:
        fraction_monochromatic_ap = 0
    
    return fraction_monochromatic_ap

# # Sample dataset
# sample_dataset = [
#     ['red', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'blue'],
#     ['red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'blue'],
#     ['blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue'],
#     ['red', 'red', 'blue', 'blue', 'red', 'red', 'blue', 'blue', 'red', 'red'],
#     ['red', 'blue', 'blue', 'red', 'blue', 'red', 'red', 'blue', 'blue', 'red']
# ]

# Applying the function on the sample dataset
sample_monochromatic_sextuples_fractions = [monochromatic_sextuples_fraction(colors) for colors in sample_dataset]

# Print calculated fractions
for i in range(5):
    print(f"Sample {i + 1}: Fraction of monochromatic sextuples: {sample_monochromatic_sextuples_fractions[i]}")


Sample 1: Fraction of monochromatic sextuples: 0.02736842105263158
Sample 2: Fraction of monochromatic sextuples: 0.03263157894736842
Sample 3: Fraction of monochromatic sextuples: 0.031578947368421054
Sample 4: Fraction of monochromatic sextuples: 0.016842105263157894
Sample 5: Fraction of monochromatic sextuples: 0.02736842105263158


In [351]:
np.mean(sample_monochromatic_sextuples_fractions)

0.03131105263157895

## Monochromatic septuples (7,7)

In [353]:
def monochromatic_septuples_fraction(colors):
    """Calculate the fraction of monochromatic arithmetic progressions of length 7."""
    n = len(colors)
    monochromatic_ap = 0
    total_ap = 0
    
    # Iterate over all possible septuples (i, j, k, l, m, o, p) that form an arithmetic progression
    for i in range(n):
        for j in range(i + 1, n):
            diff = j - i
            k = j + diff
            l = k + diff
            m = l + diff
            o = m + diff
            p = o + diff
            if p < n:
                total_ap += 1
                if colors[i] == colors[j] == colors[k] == colors[l] == colors[m] == colors[o] == colors[p]:
                    monochromatic_ap += 1
    
    # Calculate the fraction
    if total_ap != 0:
        fraction_monochromatic_ap = monochromatic_ap / total_ap
    else:
        fraction_monochromatic_ap = 0
    
    return fraction_monochromatic_ap

# # Sample dataset
# sample_dataset = [
#     ['red', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'blue', 'red', 'blue'],
#     ['red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'red', 'blue','blue','blue'],
#     ['blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue', 'blue'],
#     ['red', 'red', 'blue', 'blue', 'red', 'red', 'blue', 'blue', 'red', 'red'],
#     ['red', 'blue', 'blue', 'red', 'blue', 'red', 'red', 'blue', 'blue', 'red','red']
# ]

# Applying the function on the sample dataset
sample_monochromatic_septuples_fractions = [monochromatic_septuples_fraction(colors) for colors in sample_dataset]

# Print calculated fractions
for i in range(5):
    print(f"Sample {i + 1}: Fraction of monochromatic septuples: {sample_monochromatic_septuples_fractions[i]}")


Sample 1: Fraction of monochromatic septuples: 0.01403061224489796
Sample 2: Fraction of monochromatic septuples: 0.015306122448979591
Sample 3: Fraction of monochromatic septuples: 0.012755102040816327
Sample 4: Fraction of monochromatic septuples: 0.002551020408163265
Sample 5: Fraction of monochromatic septuples: 0.011479591836734694


In [354]:
np.mean(sample_monochromatic_septuples_fractions)

0.015652270408163266

## Anti-Symmetry

In [355]:
def anti_symmetry(colors):
    """Calculate the fraction of opposite elements with different colors."""
    n = len(colors)
    num_opposite_different_colors = 0
    total_opposite_pairs = n // 2  # Total number of opposite pairs
    
    for i in range(total_opposite_pairs):
        if colors[i] != colors[n - 1 - i]:  # Check if opposite elements have different colors
            num_opposite_different_colors += 1
    
    # Calculate the fraction
    if total_opposite_pairs != 0:
        fraction_anti_symmetry = num_opposite_different_colors / total_opposite_pairs
    else:
        fraction_anti_symmetry = 0
    
    return fraction_anti_symmetry


# Applying the function on the sample dataset
sample_anti_symmetry = [anti_symmetry(colors) for colors in sample_dataset]

# Print first 5 calculated fractions
for i in range(min(5, len(sample_anti_symmetry))):
    print(f"Sample {i + 1}: {sample_anti_symmetry[i]:.4f}")


Sample 1: 0.6200
Sample 2: 0.5600
Sample 3: 0.3800
Sample 4: 0.6000
Sample 5: 0.4400


## Proportions of more popular color

In [356]:

def calculate_proportion_more_popular_color(colors):
    """Calculate the proportion of the more popular color."""
    # Count occurrences of each color
    counts = {color: colors.count(color) for color in set(colors)}
    
    # Determine the more popular color
    more_popular_color = max(counts, key=counts.get)
    
    # Calculate proportion of the more popular color
    proportion_more_popular_color = counts[more_popular_color] / len(colors)
    
    return proportion_more_popular_color, more_popular_color

# Applying the function on the sample dataset
sample_proportions = [calculate_proportion_more_popular_color(colors) for colors in sample_dataset]

# first 5 calculated proportions and corresponding colors
for i in range(5):
    proportion, color = sample_proportions[i]
    print(f"Sample {i + 1}: Proportion of the more popular color: {proportion}, The more popular color is: {color}")

Sample 1: Proportion of the more popular color: 0.53, The more popular color is: red
Sample 2: Proportion of the more popular color: 0.56, The more popular color is: blue
Sample 3: Proportion of the more popular color: 0.55, The more popular color is: blue
Sample 4: Proportion of the more popular color: 0.5, The more popular color is: red
Sample 5: Proportion of the more popular color: 0.58, The more popular color is: red


In [357]:
# Extracting only the proportions from the sample_proportions
sample_popular_proportions_only = [proportions for proportions, _ in sample_proportions]

# first 5 calculated proportions
for i in range(5):
    print(f"Sample {i + 1}: Proportion of the more popular color: {sample_popular_proportions_only[i]}")

Sample 1: Proportion of the more popular color: 0.53
Sample 2: Proportion of the more popular color: 0.56
Sample 3: Proportion of the more popular color: 0.55
Sample 4: Proportion of the more popular color: 0.5
Sample 5: Proportion of the more popular color: 0.58


## Transition Frequency

In [358]:
def calculate_color_transition_frequency(colors):
    """Calculate the frequency of transitions between different colors."""
    transitions = 0
    for i in range(1, len(colors)):
        if colors[i] != colors[i - 1]:
            transitions += 1
    return transitions/200

In [359]:
# Applying the function on the sample dataset
color_transition = [calculate_color_transition_frequency(colors) for colors in sample_dataset]

# first 5 calculated transition frequencies
for i in range(5):
    print(f"Sample {i + 1}: Transition frequency: {color_transition[i]}")


Sample 1: Transition frequency: 0.285
Sample 2: Transition frequency: 0.265
Sample 3: Transition frequency: 0.255
Sample 4: Transition frequency: 0.26
Sample 5: Transition frequency: 0.225


## Blockiness: Longest block

In [360]:
def blockiness(colors):
    """Calculate the blockiness of the coloring."""
    block_count = 0  # Initialize block count
    max_block_length = 0  # Initialize maximum block length
    current_block_length = 0  # Initialize current block length
    prev_color = None  # Initialize previous color tracker

    for color in colors:
        if color == prev_color:
            current_block_length += 1  # Increment current block length if the same color continues
        else:
            if current_block_length > max_block_length:
                max_block_length = current_block_length  # Update maximum block length if current block is longer
            current_block_length = 1  # Reset current block length for a new color block
            block_count += 1  # Increment block count
            prev_color = color  # Update previous color to current color

    # Check the last block
    if current_block_length > max_block_length:
        max_block_length = current_block_length  # Update maximum block length if the last block is the longest

    # Calculate blockiness as the ratio of the longest block length to the total number of colors
    return max_block_length / len(colors)

# Calculate blockiness for each coloring sequence in the sample dataset
blockiness_values = [blockiness(colors) for colors in sample_dataset]

# Print the first 5 blockiness values
for i in range(5):
    print(f"Sample {i + 1}: Blockiness: {blockiness_values[i]}")


Sample 1: Blockiness: 0.09
Sample 2: Blockiness: 0.06
Sample 3: Blockiness: 0.06
Sample 4: Blockiness: 0.05
Sample 5: Blockiness: 0.09


## Proportions of Red and Blue

In [361]:
def proportion_of_color(colors, color):
    """Calculate the proportion of a specific color in the coloring."""
    count = colors.count(color)
    total = len(colors)
    return count / total if total > 0 else 0

def proportion_of_blue(colors):
    return proportion_of_color(colors, 'blue')

def proportion_of_red(colors):
    return proportion_of_color(colors, 'red')

# Applying the functions to the sample dataset
proportion_blue = [proportion_of_blue(colors) for colors in sample_dataset]
proportion_red = [proportion_of_red(colors) for colors in sample_dataset]
#proportion_green = [proportion_of_green(colors) for colors in sample_dataset]

# Print first 5 calculated proportions for each color
print("Proportion of Blue:")
for i in range(5):
    print(f"Sample {i + 1}: {proportion_blue[i]}")

print("\nProportion of Red:")
for i in range(5):
    print(f"Sample {i + 1}: {proportion_red[i]}")

Proportion of Blue:
Sample 1: 0.47
Sample 2: 0.56
Sample 3: 0.55
Sample 4: 0.5
Sample 5: 0.42

Proportion of Red:
Sample 1: 0.53
Sample 2: 0.44
Sample 3: 0.45
Sample 4: 0.5
Sample 5: 0.58


## Length Training Data

In [None]:
#length train
length = [3] * 100000 + [4] * 100000+ [5] * 100000 + [6]*100000 + [7] * 100000

## Length Test Set

In [363]:
# length_test
length_test = [7]*50000

## DataFrame

In [364]:
train_data = pd.DataFrame(
{
'anti_symmetry':sample_anti_symmetry,
#'proportion_blue':proportion_blue,
'proportion_red':proportion_red,
'blockiness_values':blockiness_values,
'transition_frequency':color_transition,
'popular_proportions':sample_popular_proportions_only
}
)

## Squared distances Weighting

In [None]:
# Calculate the mean for each column
means = train_data[['transition_frequency', 'anti_symmetry']].mean()

# Calculate the distance
train_data['transition_frequency_squared_distance'] = (train_data['transition_frequency'] - means['transition_frequency'])
train_data['anti_symmetry_squared_distance'] = (train_data['anti_symmetry'] - means['anti_symmetry'])


In [366]:
train_data.shape

(50000, 7)

## Datasplit

In [367]:
test_data = train_data


In [368]:
# .

## Combined training data

In [None]:
combined_data = pd.concat([train_data]*5, axis = 0)

In [None]:
combined_data.head()

## Add length to training dataframe

In [372]:
test_data['length_test'] = length_test

In [None]:
combined_data['length'] = length

In [374]:
# combined_data.head()

In [375]:
test_data.shape

(50000, 8)

## Y training

In [None]:
fractions_list = [
    pd.Series(sample_monochromatic_triples_fractions, name='fractions', dtype=float).to_frame(),
    pd.Series(sample_monochromatic_quadruples_fractions, name='fractions', dtype=float).to_frame(),
    pd.Series(sample_monochromatic_quintuples_fractions, name='fractions', dtype=float).to_frame(),
    pd.Series(sample_monochromatic_sextuples_fractions, name='fractions', dtype=float).to_frame(),
    pd.Series(sample_monochromatic_septuples_fractions, name='fractions', dtype=float).to_frame()
]

# Concatenate the list of Series DataFrames
y_train = pd.concat(fractions_list, axis=0)

y_train.head()


Unnamed: 0,fractions
0,0.014031
1,0.015306
2,0.012755
3,0.002551
4,0.01148


In [377]:
y_train.mean()

fractions    0.015652
dtype: float64

In [385]:
1/(2**(3-1))

0.25

## Concatenated Training Data

In [None]:
# Concatenate side by side
final_train_df = pd.concat([test_data, y_train], axis=1)

In [380]:
final_train_df.head()

Unnamed: 0,anti_symmetry,proportion_red,blockiness_values,transition_frequency,popular_proportions,transition_frequency_squared_distance,anti_symmetry_squared_distance,length_test,fractions
0,0.62,0.53,0.09,0.285,0.53,0.037402,0.119938,7,0.014031
1,0.56,0.44,0.06,0.265,0.56,0.017402,0.059938,7,0.015306
2,0.38,0.45,0.06,0.255,0.55,0.007402,-0.120062,7,0.012755
3,0.6,0.5,0.05,0.26,0.5,0.012402,0.099938,7,0.002551
4,0.44,0.58,0.09,0.225,0.58,-0.022598,-0.060062,7,0.01148


In [None]:
final_train_df.to_csv('train_data.csv')

In [382]:
final_train_df.shape

(50000, 9)