In [None]:
# NOTES FROM MEETING
# No cell should take more than 20 minutes to run
# Please limit to one temporary collection (must be deleted at end of notebook). No external files
# Try to reduce single-line cells

# Mini-competition -
# Challenge: Limit to 50 lines of code (75 for team 1)! (excluding imports)
# Fit Success Rate: Fitting method with highest fit success rate
# Fit Average error: Fitting method with lowest fit error rate
# Fit Time complexity: Fitting method with lowest time expended
# Notebook housekeeping: Cleanest-looking notebook wins!

# Sigmoid Criteria Curve Fitting: Algebraic Approach
**Contributors:** Justin Kaufman, Marco Scialanga

**Achievement:** Curve fitting with scipy. Curves used: x^2 / sqrt(1+x^2) (algebraic) and the generalized logistic function.

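**Requirements:** A running MongoDB instance reachable at `localhost:27017` that holds the `osu_random_db` database (collections `beatmap_criteria_curve` and `osu_beatmaps_attribs_modZero`), plus the project's `mlpp` and `exploration` packages importable from `../..`.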


## Imports
**Most important packages:** Pymongo, Scipy, Numpy, Pandas.

In [None]:
# Import packages you need
import sys
sys.path.append('../..')
import numpy as np
import matplotlib.pyplot as plt
from pymongo import MongoClient
import math
from scipy.optimize import curve_fit
from mlpp.data_modeling.sigmoid_fitting import *
from exploration.config import mongo_inst
import pandas as pd

## Connection with MongoDB
**Database:** osu_random_db.

In [None]:
# Connect to the MongoDB server on localhost:27017 and select the
# osu_random_db database that every later cell works against.
client = MongoClient(host='localhost', port=27017)
db = client["osu_random_db"]

# Handle to the same-named database obtained through the project's shared
# mongo instance.
# NOTE(review): osu_db is not referenced by any later cell in this notebook —
# confirm whether it is still needed.
osu_db = mongo_inst["osu_random_db"]

## Loading the Ids
**Collection:** beatmap_criteria_curve.

In [None]:
# Load the ids of every document in beatmap_criteria_curve.
# The projection {"_id": 1} asks the server to return only the _id field, so
# the curve data itself is never transferred.
cursor = db["beatmap_criteria_curve"].find({}, {"_id": 1})

# Consume the cursor once, straight into the final list (no intermediate copy).
Ids = [doc["_id"] for doc in cursor]

## Storing New Data
**Objective:** store information about our curve fits in the collection.

In [None]:
# Run every CDF curve fit over the loaded ids: generalized logistic first,
# then algebraic. Per the original note the results are stored in the
# beatmap_criteria_curve collection — the write itself happens inside the
# helpers from mlpp.data_modeling.sigmoid_fitting (confirm there).
for store_fit in (store_genLog, store_alg):
    store_fit(Ids, db)

## Create Success Rate Table 
**Objective:** create a table to compare the success rates of the curve fits.

In [None]:
# Fit success-rate summary table: 2 rows (one per fitted function) x 4 rate
# columns (low / medium / high difficulty and overall). Per the original note
# the bands are 1*-4*, 4*-6*, 6*+ and all; the actual thresholds live in the
# fit_* helpers.
# Will take around 3 minutes to run.
collection = db["osu_beatmaps_attribs_modZero"]

# Start from a clean slate: the copied documents keep their original _id, so a
# leftover attrib_17 from an interrupted run would make insert_many fail with
# duplicate-key errors. Dropping a collection that does not exist is a no-op.
db["attrib_17"].drop()
try:
    # Temporary collection: the attrib_id == 17 documents of the beatmaps in Ids.
    db["attrib_17"].insert_many(
        collection.aggregate([{'$match': {"beatmap_id": {'$in': Ids}, "attrib_id": 17}}]))

    # Each result is indexed as [0]/[1] = success rate and [2]/[3] = average MSE
    # for (generalized logistic, algebraic). The MSE table cell reuses a, b, c, d,
    # so these names must stay defined.
    # NOTE(review): the fit_* helpers presumably read the temporary attrib_17
    # collection through db — confirm in mlpp.data_modeling.sigmoid_fitting.
    a = fit_lowDiff(db)
    b = fit_mediumDiff(db)
    c = fit_highDiff(db)
    d = fit_all(db)
finally:
    # Always remove the temporary collection, even when a fit raises, so the
    # notebook never leaves it behind.
    db["attrib_17"].drop()

# Build the whole table in one constructor call; dict order fixes column order.
dfSuccess = pd.DataFrame({
    "Function": ["Generalized Logistic", "Algebraic"],
    "SuccessRateLowDiff": [a[0], a[1]],
    "SuccessRateMediumDiff": [b[0], b[1]],
    "SuccessRateHighDiff": [c[0], c[1]],
    "OverallSuccessRate": [d[0], d[1]],
})
dfSuccess

## Create Mean Squared Error Table 
**Objective:** create a table to compare the mean squared error of the curve fits that succeeded.

In [None]:
# Error summary for the curves that could be fit, laid out like the
# success-rate table: one row per fitted function, one column per difficulty
# band plus the overall figure.
# No new computation happens here: a, b, c, d come from the success-rate cell,
# where positions [2] and [3] hold the values used as the generalized-logistic
# and algebraic average MSE.
dfMse = pd.DataFrame({
    "Function": ["Generalized Logistic", "Algebraic"],
    "AverageMseLowDiff": [a[2], a[3]],
    "AverageMseMediumDiff": [b[2], b[3]],
    "AverageMseHighDiff": [c[2], c[3]],
    "OverallAverageMse": [d[2], d[3]],
})
dfMse

## Conclusion
The algebraic (alg) function has a much higher success rate than the generalized logistic (genLog) function, probably because it has fewer parameters. Thus, genLog's slight advantage in mean squared error does not justify using it instead of alg. However, the messy nature of the data does not allow us to decide which model would work better on more ideal distributions.