## Notebook Objective: Analyze the Most Popular Beatmaps and their Attributes

In [None]:
import sys
sys.path.append('../..')
from pymongo import UpdateOne
import pandas as pd
import matplotlib.pyplot as plt
from exploration.config import mongo_inst
from pymongo import MongoClient
import seaborn as sns
from mlpp.data_collection.sample import ScoresSubset, get_more_recent_than

In [None]:
# Two handles are opened on "osu_random_db": `db` through a MongoClient on
# localhost:27017 and `data` through the project's `mongo_inst`.
# NOTE(review): later cells read through `data[...]` and write through `db.<name>`;
# that presumably relies on both handles pointing at the same server — confirm
# against exploration.config.
client = MongoClient('localhost', 27017)
db = client.osu_random_db
data = mongo_inst["osu_random_db"]

# Sampling helper constructed from the 'osu_scores_high' and 'osu_user_stats' collections.
osu_subset = ScoresSubset(data['osu_scores_high'], data['osu_user_stats'])

In [None]:
# Draw the random sample. The 'sample_scores_1M' and 'sample_users_1M' collections
# are handed to init_random_sample (presumably as the destinations it fills — see
# mlpp.data_collection.sample). Per the original note, this creates a new
# collection of 1 million scores.
new_subset, user_ids = osu_subset.init_random_sample(data['sample_scores_1M'], data['sample_users_1M'])

In [None]:
# Keep only the scores of the 1M sample that are more recent than
# 2018-12-01 00:00:00; "sample_scores_500k" is passed as the last argument
# (presumably the name of the output collection).
collection = data["sample_scores_1M"]
get_more_recent_than(collection, 2018, 12, 1, 0, 0, 0, "sample_scores_500k")
# Result noted by the author: about 500k sample scores more recent than 12/1/2018 (last 2 years).
# get_more_recent_than is imported from mlpp.data_collection.sample.

In [None]:
# Handle on "sample_scores_500k". NOTE: `collection2` is rebound to a different
# collection further down, so cells that use it depend on execution order.
collection2 = data["sample_scores_500k"]

In [None]:
# Largest estimated user pp ("mlpp.est_user_pp") among the scores in sample_scores_500k.
cursor = db.sample_scores_500k.aggregate([
    # A constant _id puts every document into one group, so $max spans the whole collection.
    {"$group": {"_id": None, "max": {"$max": "$mlpp.est_user_pp"}}}
])

# The pipeline yields at most one document. An empty collection yields none, and
# $max yields null when no document carries the field. Fail loudly here instead
# of with a NameError/TypeError in a later cell.
document = next(cursor, None)
if document is None or document["max"] is None:
    raise ValueError(
        "sample_scores_500k contains no 'mlpp.est_user_pp' values; cannot compute max_pp"
    )

print(document)
max_pp = document["max"]
max_pp

## Objective 1: Create a "Uniform" Collection 

In [None]:
# Build the "uniform" collection: up to `bin_sample_size` randomly chosen scores
# from every `bin_width`-wide bin of mlpp.est_user_pp, so that no pp range
# dominates the sample.
bin_width = 100
bin_sample_size = 800

# Read the source explicitly rather than through `collection2`, which is rebound
# to another collection later in the notebook.
scores_500k = data["sample_scores_500k"]

# Start from an empty target so re-running the cell does not fail on duplicate _id values.
db.uniform_collection2.drop()

bin_start = 0
# Iterate while the bin still starts at or below max_pp so the last (partial)
# bin, the one containing max_pp itself, is sampled too.
while bin_start <= max_pp:
    bin_end = bin_start + bin_width
    sampled_scores = list(scores_500k.aggregate([
        # Half-open interval [bin_start, bin_end): a score sitting exactly on a
        # bin edge belongs to exactly one bin instead of being dropped.
        {'$match': {'mlpp.est_user_pp': {'$gte': bin_start, '$lt': bin_end}}},
        {'$sample': {'size': bin_sample_size}},
    ]))
    # insert_many refuses an empty batch, and sparse pp ranges can be empty.
    if sampled_scores:
        db.uniform_collection2.insert_many(sampled_scores)
    bin_start = bin_end
# creates a new "uniform" collection

## Objective 2: Create a Collection of the 1000 Most Popular Beatmaps

In [None]:
# Once the uniform collection exists, count its scores per beatmap and write the
# 1000 beatmaps with the most scores to `oneThousand_most_popular_maps2`
# ($out replaces that collection on every run).
most_popular_pipeline = [
    # One output document per beatmap: _id = beatmap id, count = number of scores.
    {'$group': {'_id': '$beatmap_id', 'count': {'$sum': 1}}},
    # Many beatmaps share the same count; the _id tie-breaker makes both the
    # order and the cut made by $limit deterministic.
    {'$sort': {'count': -1, '_id': 1}},
    {'$limit': 1000},
    {'$out': 'oneThousand_most_popular_maps2'},
]
db.uniform_collection2.aggregate(most_popular_pipeline)

## Objective 3: Relationship as Score Count Decreases

In [None]:
# Plot the score count of each popular beatmap, from the most to the least played.
x = db.oneThousand_most_popular_maps2

g = []  # score counts, most popular beatmap first
h = []  # beatmap ids, in the same order as g

# A single sorted query keeps counts and ids aligned and guarantees the
# descending order the plot is meant to show (natural order is not guaranteed).
for doc in x.find({}, {"_id": 1, "count": 1}).sort([("count", -1), ("_id", 1)]):
    g.append(doc['count'])
    h.append(doc['_id'])

plt.plot(g, 'ro')  # x axis is the position in the popularity ranking
plt.xlabel('index')
plt.ylabel('count')
plt.title('Relationship as Score Count Decreases')
plt.show()

# Score count seems to decrease exponentially

## Objective 4 & 6: Feature Distribution Analysis

In [None]:
# Rebind `collection` (previously "sample_scores_1M") to the "osu_beatmap_attribs" collection.
collection = data["osu_beatmap_attribs"]

In [None]:
# Create `osu_beatmaps_attribs_modZero`: the documents of "osu_beatmap_attribs"
# whose `mods` field is 0.

# Start from an empty target: the copied documents keep their _id, so inserting
# them a second time would fail with duplicate-key errors.
db.osu_beatmaps_attribs_modZero.drop()

# The source is named explicitly because `collection` is rebound several times
# in this notebook; the cursor is streamed straight into insert_many.
db.osu_beatmaps_attribs_modZero.insert_many(
    data["osu_beatmap_attribs"].aggregate([
        {'$match': {'mods': 0}}
    ])
)

In [None]:
# Rebind `collection` to the "oneThousand_most_popular_maps2" collection.
collection = data['oneThousand_most_popular_maps2']

In [None]:
# Ids of the most popular beatmaps (the `_id` of each document in
# oneThousand_most_popular_maps2 is a beatmap id, see the $group stage that built it).
# Iterating the cursor directly, instead of indexing positions 0..999, also works
# when the collection holds fewer than 1000 beatmaps.
cursor = data['oneThousand_most_popular_maps2'].find({}, {"_id": 1})
listOfIds = [doc['_id'] for doc in cursor]
len(listOfIds)

In [None]:
# Rebind `collection2` (previously "sample_scores_500k") to the "osu_beatmap_attribs" collection.
collection2 = data['osu_beatmap_attribs']

In [None]:
# From the beatmap attribute collection, create a new collection holding the
# attributes of the most popular maps.
#
# The ids in listOfIds are beatmap ids, and attribute documents reference their
# beatmap through the `beatmap_id` field (the field the attribute cells below
# filter on), so the match has to be on `beatmap_id`, not on the document `_id`.
# One $in query replaces one round trip per beatmap.
popular_attribs = list(
    data['osu_beatmap_attribs'].aggregate([
        {'$match': {"beatmap_id": {'$in': listOfIds}}}
    ])
)

# Start clean so a re-run does not collide with previously copied _id values,
# and skip the insert when nothing matched (insert_many rejects an empty batch).
db.oneThousand_Beatmaps_attribs.drop()
if popular_attribs:
    db.oneThousand_Beatmaps_attribs.insert_many(popular_attribs)
len(popular_attribs)

In [None]:
# Rebind `collection` to the mod-0 attribute collection "osu_beatmaps_attribs_modZero".
collection = data["osu_beatmaps_attribs_modZero"]

Distribution for attribute 5

In [None]:
# Copy the attribute-5 rows of the most popular beatmaps (mod-0 attributes only)
# into the `attrib_5` collection.
attrib_5_docs = list(
    # Named explicitly instead of relying on the current binding of `collection`.
    data["osu_beatmaps_attribs_modZero"].aggregate([
        {'$match': {"beatmap_id": {'$in': listOfIds}, "attrib_id": 5}}
    ])
)

db.attrib_5.drop()     # a re-run would otherwise fail on duplicate _id values
if attrib_5_docs:      # insert_many rejects an empty batch
    db.attrib_5.insert_many(attrib_5_docs)
len(attrib_5_docs)

In [None]:
# Collect beatmap ids (c1) and attribute-5 values (d) and plot the value distribution.
t = db.attrib_5

c1 = []  # beatmap ids
d = []   # attribute-5 values (referred to as OD values further down)

# One query fills both lists, so c1[i] and d[i] always describe the same
# document; two independent unsorted queries carry no such guarantee.
for doc in t.find({}, {"_id": 0, "beatmap_id": 1, "value": 1}):
    c1.append(doc["beatmap_id"])
    d.append(doc["value"])

plt.hist(d)
plt.xlabel("attribute 5 value")
plt.ylabel("number of beatmaps")
plt.title("Distribution for attribute 5")
plt.show()

Distribution for attribute 17

In [None]:
# Copy the attribute-17 rows of the most popular beatmaps (mod-0 attributes only)
# into the `attrib_17` collection.
attrib_17_docs = list(
    # Named explicitly instead of relying on the current binding of `collection`.
    data["osu_beatmaps_attribs_modZero"].aggregate([
        {'$match': {"beatmap_id": {'$in': listOfIds}, "attrib_id": 17}}
    ])
)

db.attrib_17.drop()    # a re-run would otherwise fail on duplicate _id values
if attrib_17_docs:     # insert_many rejects an empty batch
    db.attrib_17.insert_many(attrib_17_docs)
len(attrib_17_docs)

In [None]:
# Collect beatmap ids (j) and attribute-17 values (k) and plot the value distribution.
# NOTE: j and k are overwritten by every distribution cell that follows.
t = db.attrib_17

j = []  # beatmap ids
k = []  # attribute-17 values

# One query fills both lists, so j[i] and k[i] always describe the same document.
for doc in t.find({}, {"_id": 0, "beatmap_id": 1, "value": 1}):
    j.append(doc["beatmap_id"])
    k.append(doc["value"])

plt.hist(k)
plt.xlabel("attribute 17 value")
plt.ylabel("number of beatmaps")
plt.title("Distribution for attribute 17")
plt.show()

Distribution for attribute 1

In [None]:
# Copy the attribute-1 rows of the most popular beatmaps (mod-0 attributes only)
# into the `attrib_1` collection.
attrib_1_docs = list(
    # Named explicitly instead of relying on the current binding of `collection`.
    data["osu_beatmaps_attribs_modZero"].aggregate([
        {'$match': {"beatmap_id": {'$in': listOfIds}, "attrib_id": 1}}
    ])
)

db.attrib_1.drop()     # a re-run would otherwise fail on duplicate _id values
if attrib_1_docs:      # insert_many rejects an empty batch
    db.attrib_1.insert_many(attrib_1_docs)
len(attrib_1_docs)

In [None]:
# Collect beatmap ids (j) and attribute-1 values (k) and plot the value distribution.
# NOTE: j and k are overwritten by every distribution cell that follows.
t = db.attrib_1

j = []  # beatmap ids
k = []  # attribute-1 values

# One query fills both lists, so j[i] and k[i] always describe the same document.
for doc in t.find({}, {"_id": 0, "beatmap_id": 1, "value": 1}):
    j.append(doc["beatmap_id"])
    k.append(doc["value"])

plt.hist(k)
plt.xlabel("attribute 1 value")
plt.ylabel("number of beatmaps")
plt.title("Distribution for attribute 1")
plt.show()

Distribution for attribute 3

In [None]:
# Copy the attribute-3 rows of the most popular beatmaps (mod-0 attributes only)
# into the `attrib_3` collection.
attrib_3_docs = list(
    # Named explicitly instead of relying on the current binding of `collection`.
    data["osu_beatmaps_attribs_modZero"].aggregate([
        {'$match': {"beatmap_id": {'$in': listOfIds}, "attrib_id": 3}}
    ])
)

db.attrib_3.drop()     # a re-run would otherwise fail on duplicate _id values
if attrib_3_docs:      # insert_many rejects an empty batch
    db.attrib_3.insert_many(attrib_3_docs)
len(attrib_3_docs)

In [None]:
# Collect beatmap ids (j) and attribute-3 values (k) and plot the value distribution.
# NOTE: j and k are overwritten by every distribution cell that follows.
t = db.attrib_3

j = []  # beatmap ids
k = []  # attribute-3 values

# One query fills both lists, so j[i] and k[i] always describe the same document.
for doc in t.find({}, {"_id": 0, "beatmap_id": 1, "value": 1}):
    j.append(doc["beatmap_id"])
    k.append(doc["value"])

plt.hist(k)
plt.xlabel("attribute 3 value")
plt.ylabel("number of beatmaps")
plt.title("Distribution for attribute 3")
plt.show()

Distribution for attribute 7

In [None]:
# Copy the attribute-7 rows of the most popular beatmaps (mod-0 attributes only)
# into the `attrib_7` collection.
attrib_7_docs = list(
    # Named explicitly instead of relying on the current binding of `collection`.
    data["osu_beatmaps_attribs_modZero"].aggregate([
        {'$match': {"beatmap_id": {'$in': listOfIds}, "attrib_id": 7}}
    ])
)

db.attrib_7.drop()     # a re-run would otherwise fail on duplicate _id values
if attrib_7_docs:      # insert_many rejects an empty batch
    db.attrib_7.insert_many(attrib_7_docs)
len(attrib_7_docs)

In [None]:
# Collect beatmap ids (j) and attribute-7 values (k) and plot the value distribution.
# NOTE: j and k are overwritten by every distribution cell that follows.
t = db.attrib_7

j = []  # beatmap ids
k = []  # attribute-7 values

# One query fills both lists, so j[i] and k[i] always describe the same document.
for doc in t.find({}, {"_id": 0, "beatmap_id": 1, "value": 1}):
    j.append(doc["beatmap_id"])
    k.append(doc["value"])

plt.hist(k)
plt.xlabel("attribute 7 value")
plt.ylabel("number of beatmaps")
plt.title("Distribution for attribute 7")
plt.show()

Distribution for attribute 9

In [None]:
# Copy the attribute-9 rows of the most popular beatmaps (mod-0 attributes only)
# into the `attrib_9` collection.
attrib_9_docs = list(
    # Named explicitly instead of relying on the current binding of `collection`.
    data["osu_beatmaps_attribs_modZero"].aggregate([
        {'$match': {"beatmap_id": {'$in': listOfIds}, "attrib_id": 9}}
    ])
)

db.attrib_9.drop()     # a re-run would otherwise fail on duplicate _id values
if attrib_9_docs:      # insert_many rejects an empty batch
    db.attrib_9.insert_many(attrib_9_docs)
len(attrib_9_docs)

In [None]:
# Collect beatmap ids (j) and attribute-9 values (k) and plot the value distribution.
# NOTE: j and k are overwritten by every distribution cell that follows.
t = db.attrib_9

j = []  # beatmap ids
k = []  # attribute-9 values

# One query fills both lists, so j[i] and k[i] always describe the same document.
for doc in t.find({}, {"_id": 0, "beatmap_id": 1, "value": 1}):
    j.append(doc["beatmap_id"])
    k.append(doc["value"])

plt.hist(k)
plt.xlabel("attribute 9 value")
plt.ylabel("number of beatmaps")
plt.title("Distribution for attribute 9")
plt.show()

Distribution for attribute 11

In [None]:
# Copy the attribute-11 rows of the most popular beatmaps (mod-0 attributes only)
# into the `attrib_11` collection.
attrib_11_docs = list(
    # Named explicitly instead of relying on the current binding of `collection`.
    data["osu_beatmaps_attribs_modZero"].aggregate([
        {'$match': {"beatmap_id": {'$in': listOfIds}, "attrib_id": 11}}
    ])
)

db.attrib_11.drop()    # a re-run would otherwise fail on duplicate _id values
if attrib_11_docs:     # insert_many rejects an empty batch
    db.attrib_11.insert_many(attrib_11_docs)
len(attrib_11_docs)

In [None]:
# Collect beatmap ids (j) and attribute-11 values (k) and plot the value distribution.
# NOTE: the Objective 5 cells below read j and k, so in a top-to-bottom run they
# see the attribute-11 data produced here.
t = db.attrib_11

j = []  # beatmap ids
k = []  # attribute-11 values

# One query fills both lists, so j[i] and k[i] always describe the same document.
for doc in t.find({}, {"_id": 0, "beatmap_id": 1, "value": 1}):
    j.append(doc["beatmap_id"])
    k.append(doc["value"])

plt.hist(k)
plt.xlabel("attribute 11 value")
plt.ylabel("number of beatmaps")
plt.title("Distribution for attribute 11")
plt.show()

## Objective 5: Star/OD Correlation with Popularity of Beatmap

In [None]:
# Load the popular-beatmap counts into a frame ordered by beatmap id (`_id`).
# reset_index() keeps the pre-sort position as an "index" column, as before.
popular_maps_cursor = db.oneThousand_most_popular_maps2.find({})
df = (
    pd.DataFrame(list(popular_maps_cursor))
    .sort_values(["_id"])
    .reset_index()
)

# Score counts, in beatmap-id order.
a = df["count"]

In [None]:
# Pair each beatmap's attribute value with its score count.
# NOTE(review): j and k hold whatever the most recently executed distribution
# cell produced (attribute 11 in a top-to-bottom run) — confirm that this is the
# attribute meant by "Star Difficulty".
df1 = pd.DataFrame({"beatmap_id": j, "Star Difficulty": k})

# Attach the count by beatmap id. Assigning the count column positionally would
# only be right if j happened to be in the same order (and of the same length)
# as the id-sorted `df`.
count_by_beatmap = df.set_index("_id")["count"]
df1["count"] = df1["beatmap_id"].map(count_by_beatmap)

# Most popular beatmaps first.
df1 = df1.sort_values(by="count", ascending=False)
c = df1["count"]
sd = df1["Star Difficulty"]
df1

In [None]:
# Score count (x) against the "Star Difficulty" column of df1 (y).
plt.scatter(c, sd)
plt.xlabel("count")
plt.ylabel("Star Difficulty")
plt.title("Star Difficulty vs. Score Count")
plt.show()

In [None]:
# Pair each beatmap's OD value with its score count.
# Recall c1 / d are the beatmap ids / OD values collected from `attrib_5`.
df2 = pd.DataFrame({"beatmap_id": c1, "OD": d})

# Attach the count by beatmap id. A positional assignment would mislabel rows
# unless c1 happened to follow the id-sorted order of `df`.
od_count_by_beatmap = df.set_index("_id")["count"]
df2["count"] = df2["beatmap_id"].map(od_count_by_beatmap)
df2

In [None]:
# Columns of df2 used by the scatter plot: score counts and OD values.
c2 = df2["count"]
od = df2["OD"]

In [None]:
# Score count (x) against the OD column of df2 (y).
plt.scatter(c2, od)
plt.xlabel("count")
plt.ylabel("OD")
plt.title("OD vs. Score Count")
plt.show()

## Objective 7: Heatmap of Correlation Between Attributes

In [None]:
# Load every mod-0 attribute document and discard the two columns the
# correlation analysis does not use.
mod_zero_docs = list(db.osu_beatmaps_attribs_modZero.find({}))
df3 = pd.DataFrame(mod_zero_docs).drop(columns=["_id", "mods"])

In [None]:
# Preview the first 20 rows of the mod-0 attribute frame.
df3.head(20)

In [None]:
# Reshape the long (beatmap_id, attrib_id, value) rows of df3 into a wide table
# with one row per beatmap and one column per attribute id.
col = df3['attrib_id'].unique()   # attribute ids, in order of first appearance
ind = df3['beatmap_id'].unique()  # beatmap ids, in order of first appearance

DF = (
    df3.groupby(['beatmap_id', 'attrib_id'])['value']
    # Take the first value of each pair explicitly; calling float() on the
    # group's Series is deprecated and fails outright if a pair has several rows.
    .first()
    .astype('float64')
    .unstack('attrib_id')
    # unstack sorts both axes; restore first-appearance order because the
    # column labels are assigned by position further down.
    .reindex(index=ind, columns=col)
    .rename_axis(index=None, columns=None)
)

DF.head()

In [None]:
# Replace the numeric attribute ids with readable names.
# NOTE(review): the names are assigned purely by position, i.e. they follow the
# order in which attribute ids first appear in df3, and the assignment raises if
# there are not exactly seven columns — confirm the id-to-name correspondence.
DF.columns = ["Aim", "Speed", "OD", "AR", "Max_Combo", "Strain", "Star Difficulty"]

In [None]:
# Display DF with the beatmap ids moved from the index into a regular column.
# The result is not assigned, so DF itself is left unchanged.
DF.reset_index()

In [None]:
# Pairwise correlation between the attribute columns, drawn as an annotated heatmap.
correlation = DF.astype('float64').corr()

f, ax = plt.subplots(figsize=(14, 12))
ax.set_title("Correlation of Attributes")
sns.heatmap(correlation, annot=True, ax=ax)
plt.show()



In [None]:
# collections = db.list_collection_names() 
# print ("collections:", collections, "\n")

# Lists all of the collections in the database

## Objective 8: Conclusion

In [None]:
# Attribute AR is left-skewed
# Attribute Max Combo is right-skewed
# Aim and Star Difficulty, as well as Strain and Star Difficulty, are highly correlated
# Speed and OD, as well as AR and OD, are highly correlated
# Players tend ... (TODO: this sentence was left unfinished)