In [72]:
import csv
from __future__ import print_function
import pandas as pd 
import os
import re
import sys
import time
from bs4 import BeautifulSoup
from os.path import dirname, abspath
import requests
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import pprint

def whole_prep_dataset(es_instance, my_index, query_type):
    try:  # Check status of ES server
        requests.get('http://localhost:9200')
    except:
        print('Elasticsearch service has not be stared, auto-exit now.')
        exit()
    # path is the parent dir of __file__'s location
    #path = dirname(dirname(abspath(__file__)))
    doc_path = 'spotify.csv'

    # create empty index,
    if es_instance.indices.exists(index=my_index):
        print('Index {} exists. removing it...'.format(my_index))
        es_instance.indices.delete(index=my_index)

    settings = {
        'index': {
            'store': {
                'type': 'default'
            },
            'max_result_window': 85000,
            'number_of_shards': 1,
            'number_of_replicas': 0
        },
        'analysis': {
            'analyzer': {
                'articles': {
                    'type': 'english'
                },
                'title':{
                    'type':'english'
                }
            }
        }
    }
    doc_mappings = {
        'properties': {
            'no': {
                'type': 'string',
                'store': True,
                'index': 'not_analyzed'
            },
            'country':{
                'type': 'string',
                'store': True,
                'index': 'analyzed',
                'term_vector': 'with_positions_offsets_payloads',
                'analyzer': 'articles'
            },
            'uri':{
                'type': 'string',
                'store': True,
                'index': 'analyzed',
                'term_vector': 'with_positions_offsets_payloads',
                'analyzer': 'articles'
            },
            'popularity':{
                'type': 'string',
                'store': True,
                'index': 'analyzed',
                'term_vector': 'with_positions_offsets_payloads',
                'analyzer': 'articles'
            },
        
            'title':{
                'type': 'string',
                'store': True,
                'index': 'analyzed',
                'term_vector': 'with_positions_offsets_payloads',
                'analyzer': 'articles'
            },    
            'artist':{
                'type': 'string',
                'store': True,
                'index': 'analyzed',
                'term_vector': 'with_positions_offsets_payloads',
                'analyzer': 'articles'
            },
            'album':{
                 'type': 'string',
                'store': True,
                'index': 'analyzed',
                'term_vector': 'with_positions_offsets_payloads',
                'analyzer': 'articles'
            },
            'date': {
                'type': 'string',
                'store': True,
                'index': 'analyzed',
                'term_vector': 'with_positions_offsets_payloads',
                'analyzer': 'articles'
            },
            'genre': {
                'type': 'string',
                'store': True,
                'index': 'analyzed',
                'term_vector': 'with_positions_offsets_payloads',
                'analyzer': 'articles'
            },
            'cluster': {
                'type': 'string',
                'store': True,
                'index': 'analyzed',
                'term_vector': 'with_positions_offsets_payloads',
                'analyzer': 'articles'
            }
        }
    }
    query_mappings = {
        'properties': {
            'queryno': {
                'type': 'short',
                'store': True,
                'index': 'not_analyzed'
            },
            'sentence': {
                'type': 'string',
                'store': True,
                'index': 'analyzed',
                'term_vector': 'with_positions_offsets_payloads',
                'analyzer': 'articles'
            }
        }
    }
    mappings = {
        "dynamic": True,
        'document': doc_mappings,
        'query': query_mappings
    }
    print('Creating index {}...'.format(my_index))
    create_dataset(es_instance, my_index, settings, mappings)
    print('Loading documents...')
    load_docs(es_instance, my_index, doc_path)
    print('Dataset is all set.')



def create_dataset(es_instance, my_index, setting, mapping):
    try:
        es_instance.options(ignore_status=[400]).indices.create(
            index=my_index,
            mappings=mapping,
            settings=setting
        )            

    except:
        e = sys.exc_info()
        pprint.pprint('<p>Error: {}</p>'.format(e))



def parse_doc(doc_path):
            
    with open(doc_path, mode='r', encoding='utf-8-sig') as f:
        reader = csv.reader(f)        
        header = next(reader)
        count = 1
        for row in reader:
            no = count
            count+=1
            country = row[0]
            uri = row[1]
            popularity = float(row[2])
            title = row[3]
            artist = row[4]
            album = row[5]
            date = row[6]
            genre = row[7]
            cluster = row[8]
            #print(row[6])
            yield no,country,uri,popularity,title,artist,album,date,genre,cluster


def load_docs(es_instance, my_index, doc_path):
    action = ({
        '_index': my_index,
        '_source': {'no': no,'country':country,'uri':uri,'popularity':popularity, 'title':title, 'artist':artist,'album':album,'date': date, 'genre':genre,'cluster':cluster},
        '_id': no
    } for no,country,uri,popularity,title,artist,album,date,genre,cluster in parse_doc(doc_path))
    helpers.bulk(es_instance, action)
    # helpers.parallel_bulk(client=es_instance, actions=action, thread_count=4)


        
        
        
        

if __name__ == '__main__':
    start_time = time.time()
    es = Elasticsearch(['http://localhost:9200'],request_timeout=3600)  # 连接本地9200端口
    ap_index = 'spotify_dataset'
    doc_type = 'document'
    query_type = 'query'

    whole_prep_dataset(es, ap_index, query_type)
    print("--- {} seconds ---".format(time.time() - start_time))        
    
        

Index spotify_dataset exists. removing it...
Creating index spotify_dataset...
Loading documents...
Dataset is all set.
--- 4.835443019866943 seconds ---


In [73]:
import json


# body = {
#     "query":{
#       "bool": { 
# #         "must":     [{ "match": { "date": "2017" }}],
        
#         "should": [ 
         
#             { "match":
#                  { "title": {
#                     "query":"mars"
#                     , "boost": 2 }}},
#             { "match": { "date": "2017" }},
#             { "match":{"artist":"mars"}}
#         ] 
#         } 
#     }
# }

query={
  "query": {
    "function_score": {
      "query": {  
        "match": {
          "artist":"Ed Sheeran",
        }
      },
      "field_value_factor": {  
        "field": "popularity", 
        "modifier": "log1p", 
        "factor": 0.1
      }
    }
  }
}




es.search(index="spotify_dataset", body = query,size=20)

  es.search(index="spotify_dataset", body = query,size=20)


ObjectApiResponse({'took': 25, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 95, 'relation': 'eq'}, 'max_score': 58.560513, 'hits': [{'_index': 'spotify_dataset', '_id': '3070', '_score': 58.560513, '_source': {'no': 3070, 'country': 'Global', 'uri': 'https://open.spotify.com/track/7qiZfU4dY1lWllzX7mPBI3', 'popularity': 221571.0, 'title': 'shape of you', 'artist': 'Ed Sheeran', 'album': 'album', 'date': 'pop', 'genre': '71783101', 'cluster': 'FALSE'}}, {'_index': 'spotify_dataset', '_id': '1343', '_score': 49.539948, '_source': {'no': 1343, 'country': 'Global', 'uri': 'https://open.spotify.com/track/0afhq8XCExXpqazXczTSve', 'popularity': 47431.0, 'title': 'galway girl', 'artist': 'Ed Sheeran', 'album': 'album', 'date': 'pop', 'genre': '71783101', 'cluster': 'FALSE'}}, {'_index': 'spotify_dataset', '_id': '1764', '_score': 49.252357, '_source': {'no': 1764, 'country': 'Global', 'uri': 'https://open.spotify.com/track/