In [16]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import itertools
import time
from collections import Counter
def make_features(str1, str2, equal, operation):

    # Lots of features that may be important, think about adding more specific cases, same leading or tailing chars? 
    # These are the basic ones tho
    features = {}

    features['str1 len'] = len(str1)
    features['str2 len'] = len(str2)
    features['equal len'] = len(equal)

    features['all len'] = features['str1 len'] + features['str2 len'] + features['equal len']

    overallCount = Counter(str1 + str2 + equal)
    features['unique char'] = len(overallCount)
    features['max occurences'] = max(overallCount.values())
    features['min occurences'] = min(overallCount.values())

    features['reuse ratio'] = len(str1 + str2 + equal) / len(overallCount) if overallCount != 0 else 0
    
    if operation == '+':
        features['operation'] = '+'
    elif operation == '*':
        features['operation'] = '*'
    elif operation == '/':
        features['operation'] = '/'
    else:
        features['operation'] = '*'
    
    str1Count = Counter(str1)
    str2Count = Counter(str2)
    equalCount = Counter(equal)
    sumCount = Counter(str1 + str2)

    features['unique str1 char'] = len(str1Count)
    features['unique str2 char'] = len(str2Count)
    features['unique equal char'] = len(str2Count)
    features['unique 1+2 char'] = len(sumCount)


    features['max str1 occurences'] = max(str1Count.values())
    features['min str1 occurences'] = min(str1Count.values())

    features['max str2 occurences'] = max(str2Count.values())
    features['min str2 occurences'] = min(str2Count.values())
    features['max equal occurences'] = max(equalCount.values())
    features['min equal occurences'] = min(equalCount.values())

    features['max 1+2 occurences'] = max(sumCount.values())
    features['min 1+2 occurences'] = min(sumCount.values())

    features['crypto complexity'] = features['all len'] * features['unique char']
    print(features)

    return features
    


In [17]:
def generate_training_data(num_samples=5000):
    # Need to some how create synthetic training data.
    data = []
    labels = []
    
    valid_puzzles = [
        ("SEND", "MORE", "MONEY", "+"),
        ("TWO", "TWO", "FOUR", "+"),
        ("ABC", "DEF", "GHIJ", "+"),
        ("CAB", "CAB", "DEED", "+"),
        ("EAT", "THAT", "APPLE", "+"),
    ]
    
    for str1, str2, result, op in valid_puzzles:
        features = make_features(str1, str2, result, op)
        data.append(features)
        labels.append(1)
    
    
    operations = ['+', '-', '*', '/']

    
    

In [18]:
make_features("I", "Love", "You", '+')

{'str1 len': 1, 'str2 len': 4, 'equal len': 3, 'all len': 8, 'unique char': 7, 'max occurences': 2, 'min occurences': 1, 'reuse ratio': 1.1428571428571428, 'operation': '+', 'unique str1 char': 1, 'unique str2 char': 4, 'unique equal char': 4, 'unique 1+2 char': 5, 'max str1 occurences': 1, 'min str1 occurences': 1, 'max str2 occurences': 1, 'min str2 occurences': 1, 'max equal occurences': 1, 'min equal occurences': 1, 'max 1+2 occurences': 1, 'min 1+2 occurences': 1, 'crypto complexity': 56}


{'str1 len': 1,
 'str2 len': 4,
 'equal len': 3,
 'all len': 8,
 'unique char': 7,
 'max occurences': 2,
 'min occurences': 1,
 'reuse ratio': 1.1428571428571428,
 'operation': '+',
 'unique str1 char': 1,
 'unique str2 char': 4,
 'unique equal char': 4,
 'unique 1+2 char': 5,
 'max str1 occurences': 1,
 'min str1 occurences': 1,
 'max str2 occurences': 1,
 'min str2 occurences': 1,
 'max equal occurences': 1,
 'min equal occurences': 1,
 'max 1+2 occurences': 1,
 'min 1+2 occurences': 1,
 'crypto complexity': 56}