# analyzing important features in the `thingiverse` schema and their application to dataframes


this is a good demonstration of how openapi schema can be used in cooperation with dataframes.

In [1]:
%%
## thingiverse swagger docs

thingiverse has a `swagger` `schema` that provides critical information about the requests
and responses we are going to make. we can use the schema to know what we should include and exclude
from analysis and engineering.

    # base url that the schema requests are built from (it is used as an f-string prefix later).
    # NOTE(review): the unindented url line between the parentheses is presumably turned into
    # a string by the literate tooling -- confirm before reformatting this statement.
    swagger=(
https://www.thingiverse.com/swagger/docs/

    )

a benefit of schema is that they change slowly so it makes sense to cache our requests to be good citizens.
we use a `functools.lru_cache` for in-memory cache and `requests_cache` for on-disk caching.

    
    import requests, platformdirs, requests_cache, yaml
    # cache every `requests` call on disk; the cache name is the per-user cache directory
    # that `platformdirs` reports for the ("thingiverse", "scaper") app/author pair.
    requests_cache.install_cache(platformdirs.user_cache_dir("thingiverse", "scaper"))
    @(singleton := functools.lru_cache(1))
    def get_swagger():
        """fetch and parse the root `openapi.yaml` found under the `swagger` base url.

        the result is memoized by `singleton`, a `functools.lru_cache(1)` decorator that the
        walrus expression also binds at module level so other schema getters can reuse it.

        returns the object `yaml.safe_load` builds from the response body.
        raises `requests.HTTPError` when the server answers with a 4xx/5xx status, so an
        error page is never parsed as if it were the schema.
        """
        response = requests.get(F"{swagger}openapi.yaml")
        # fail loudly here instead of with an unrelated lookup error further downstream
        response.raise_for_status()
        return yaml.safe_load(response.text)

    def get_baseurl(url, root=False):
        """trim a schema url down to the uri that can actually be scraped.

        the urls in the schema need to be processed to extract a proper uri to scrape:
        everything after the last `.yaml` is dropped. a `url` without `.yaml` is kept as is
        (it used to collapse to an empty string).

        `root is True` returns the base uri of the `url`: the part up to and including the
        last `/` in front of the yaml file name, or "" when there is no `/`.
        """
        head, suffix, _ = url.rpartition(".yaml")
        if not suffix:
            # no ".yaml" found: rpartition leaves the whole url in the last slot, so
            # restore it rather than returning an empty string
            head = url
        if root:
            head, suffix, _ = head.rpartition("/")
        return head + suffix

In [29]:
%%
### the search schema

our analysis is specifically tailored to make a `search_request` which will return other schema references that
we unpack. this response contains important thing information, but lacks features that a more fine-grained
thing request can provide.

    @singleton
    def get_search_schema():
        """return the schema of a single hit in the response to a thing search.

        the `$ref` of the `/search/{term}/?type=things` path is resolved against the
        `swagger` base url after dropping its first two characters (presumably a leading
        "./" -- confirm against the published schema). the result is memoized by `singleton`.

        returns the object `yaml.safe_load` builds from the hit schema document.
        raises `requests.HTTPError` when either request answers with a 4xx/5xx status.
        """
        search_request = swagger + get_swagger()["paths"]['/search/{term}/?type=things']["$ref"][2:]
        search_request_root = get_baseurl(search_request, True)

        # the `search` schema tells us the query parameters and `response_schema` that exist.
        search_response = requests.get(search_request)
        search_response.raise_for_status()
        search = yaml.safe_load(search_response.text)
        response_schema = search["ByTermThings"]["get"]["responses"]["200"]["content"]["application/json"]["schema"]

        # the schema of each hit lives in its own document, referenced relative to the search schema
        hit_response = requests.get(search_request_root + response_schema["properties"]["hits"]["items"]["$ref"])
        hit_response.raise_for_status()
        return yaml.safe_load(hit_response.text)

### the thing schema

    @singleton
    def get_thing_schema():
        """return the schema of the response to a single thing request.

        the `$ref` of the `/things/{thing_id}` path is resolved against the `swagger` base
        url after dropping its first two characters (presumably a leading "./" -- confirm
        against the published schema). like `get_search_schema`, the result is memoized by
        `singleton`, so repeated calls share one parsed object.

        returns the object `yaml.safe_load` builds from the thing schema document.
        raises `requests.HTTPError` when either request answers with a 4xx/5xx status.
        """
        thing_request = get_baseurl(swagger + get_swagger()["paths"]["/things/{thing_id}"]["$ref"][2:])
        thing_root = get_baseurl(thing_request, True)

        listing_response = requests.get(thing_request)
        listing_response.raise_for_status()
        thing_schemas = yaml.safe_load(listing_response.text)
        endpoint = thing_schemas["ById"]["get"]["responses"]["200"]["content"]["application/json"]["schema"]["$ref"]

        # the response schema is a separate document, referenced relative to the thing listing
        thing_response = requests.get(thing_root + endpoint)
        thing_response.raise_for_status()
        return yaml.safe_load(thing_response.text)

In [32]:
%%
## extracting type information

    FEATURES = ["integer", "string", "url", "date-time"]
    def shuffle_types(df: DataFrame) -> DataFrame:
        """relabel coarse json types with the more specific ones listed in `FEATURES`.

        rows whose index label ends with `_url` get the type "url" and rows whose `format`
        is "date-time" get the type "date-time". `df` is modified in place and returned,
        which lets the function be used with `DataFrame.pipe`.
        """
        url_rows = df.index.str.endswith("_url")
        df.loc[url_rows, "type"] = "url"
        datetime_rows = df.format.eq("date-time")
        df.loc[datetime_rows, "type"] = "date-time"
        return df


    def get_columnar_features(df: DataFrame) -> DataFrame:
        """filter out the wide data features in the dataset.

        only the rows whose `type` is one of `FEATURES` are kept.
        """
        is_columnar = df["type"].isin(FEATURES)
        return df[is_columnar]

## gather the actual schema

    # one row per schema property, one column per json-schema keyword, with refined `type` labels
    search_properties = shuffle_types(Series(get_search_schema()["properties"]).apply(Series))
    thing_properties = shuffle_types(Series(get_thing_schema()["properties"]).apply(Series))

datetime columns need to be handled with `pandas.to_datetime`


In [47]:
%%
### excluded features

in our search technique we'll want a wide table that contains numerical and string information;
we won't search in nested data structures so we leave out the `array` and `object` json schema types.

    # keep only the properties whose refined `type` is listed in FEATURES
    search_integers_and_strings = get_columnar_features(search_properties)
    thing_integers_and_strings = get_columnar_features(thing_properties)
    def excluded_features(parent, df):
        """return the rows of `parent` whose index labels do not appear in the index of `df`.

        the membership test is vectorised with `Index.isin`, which always yields a boolean
        mask, so an empty `parent` keeps its columns.
        """
        return parent[~parent.index.isin(df.index)]
{{search_properties.pipe(excluded_features, search_integers_and_strings).fillna("").style.set_caption(
    "excluded search features"
)._repr_html_()}}

{{thing_properties.pipe(excluded_features, thing_integers_and_strings).fillna("").style.set_caption(
    "excluded thing features"
)._repr_html_()}}

Unnamed: 0,type,format,example,$ref,minimum,maximum,nullable,items,properties
creator,,,,./user_summary_schema.yaml,,,,,
is_featured,boolean,,True,,,,,,
is_nsfw,boolean,,False,,,,True,,
is_ai,boolean,,False,,,,True,,
is_liked,boolean,,True,,,,,,
is_collected,boolean,,False,,,,,,
is_watched,boolean,,False,,,,,,
default_image,,,,./image_summary_schema.yaml,,,True,,
details_parts,array,,,,,,,{'type': 'object'},
edu_details_parts,array,,,,,,True,{'type': 'object'},

Unnamed: 0,type,format,example,$ref,minimum,maximum,nullable,items,properties
creator,,,,./user_summary_schema.yaml,,,,,
is_featured,boolean,,True,,,,,,
is_nsfw,boolean,,False,,,,True,,
is_ai,boolean,,False,,,,True,,
is_liked,boolean,,True,,,,,,
is_collected,boolean,,False,,,,,,
is_watched,boolean,,False,,,,,,
default_image,,,,./image_summary_schema.yaml,,,True,,
details_parts,array,,,,,,,{'type': 'object'},
edu_details_parts,array,,,,,,True,{'type': 'object'},


we can also figure out which columns are date-time
we can infer urls from the example