Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated index.js #7

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 52 additions & 119 deletions scripts/indexer/index.js
Original file line number Diff line number Diff line change
@@ -1,120 +1,59 @@
require('dotenv').config();

const _ = require('lodash');
const fastJson = require('fast-json-stringify');
const { Client } = require('typesense');
const fs = require('fs').promises;
const readline = require('readline');

// Pre-compiled JSON serializer for song documents. fast-json-stringify gave a
// marked serialization speedup over JSON.stringify on the full (~32M record)
// dataset, so keep it. IMPORTANT: this schema must be kept in sync with the
// `song` object built in the indexing loop below — fields missing from the
// schema are silently dropped from the serialized output.
const stringify = fastJson({
  title: 'Song Schema',
  type: 'object',
  properties: {
    track_id: {
      type: 'string',
    },
    title: {
      type: 'string',
    },
    album_name: {
      type: 'string',
      nullable: true,
    },
    primary_artist_name: {
      type: 'string',
    },
    genres: {
      type: 'array',
      items: {
        type: 'string',
      },
    },
    country: {
      type: 'string',
    },
    release_date: {
      type: 'integer',
    },
    release_decade: {
      type: 'string',
    },
    release_group_types: {
      type: 'array',
      items: {
        type: 'string',
      },
    },
    urls: {
      type: 'array',
      items: {
        type: 'object',
        properties: {
          type: {
            type: 'string',
          },
          url: {
            type: 'string',
          },
        },
      },
    },
  },
});

// Tunables. Environment variables always arrive as strings, so coerce to
// numbers; `|| fallback` also covers unset/empty/NaN values.
const BATCH_SIZE = Number(process.env.BATCH_SIZE) || 100; // lines buffered before each bulk import
const CHUNK_SIZE = Number(process.env.CHUNK_SIZE) || 3; // parallel import requests per batch
const MAX_LINES = Number(process.env.MAX_LINES) || Infinity; // cap on lines read from DATA_FILE
const DATA_FILE = process.env.DATA_FILE || './scripts/data/1K-records.json';

/**
 * Extracts listen/purchase links from a release record's `relations` array.
 *
 * NOTE(review): deliberately synchronous. The call site assigns the result
 * directly (`urls: extractUrls(parsedRecord)`) without awaiting, so an
 * `async` version would store an unresolved Promise in the indexed document.
 *
 * @param {object} parsedRecord - parsed release record; must have `relations`,
 *   each entry shaped like `{ type, url: { resource } }`.
 * @returns {{type: string, url: string}[]} the matching relation links.
 */
function extractUrls(parsedRecord) {
  // Only relation types that point at somewhere the song can be heard/bought.
  const URL_RELATION_TYPES = [
    'amazon asin',
    'streaming',
    'free streaming',
    'download for free',
    'purchase for download',
  ];

  return parsedRecord.relations
    .filter((r) => URL_RELATION_TYPES.includes(r.type))
    .map((r) => ({ type: r.type, url: r.url.resource }));
}

/**
 * Bulk-imports a batch of song documents into a Typesense collection.
 *
 * The batch is split into CHUNK_SIZE roughly-equal chunks which are imported
 * in parallel (NOT chunks *of* CHUNK_SIZE items — with the default of 3 that
 * would mean one HTTP request per 3 songs).
 *
 * @param {object[]} songs - song documents matching the fast-json schema.
 * @param {import('typesense').Client} typesense - configured Typesense client.
 * @param {string} collectionName - target collection.
 * @throws {Error} if any document fails to import; errors are NOT swallowed
 *   here so the caller can abort the run.
 */
async function addSongsToTypesense(songs, typesense, collectionName) {
  const perChunk = Math.ceil(songs.length / CHUNK_SIZE);

  const returnDataChunks = await Promise.all(
    _.chunk(songs, perChunk).map((songsChunk) => {
      // Typesense bulk import expects JSONL: one serialized document per line.
      const jsonlString = songsChunk.map((song) => stringify(song)).join('\n');
      return typesense
        .collections(collectionName)
        .documents()
        .import(jsonlString);
    })
  );

  // The import response is also JSONL: one `{ success, ... }` result per
  // document. Collect every failed item across all chunks.
  const failedItems = returnDataChunks.flatMap((returnData) =>
    returnData
      .split('\n')
      .map((line) => JSON.parse(line))
      .filter((item) => item.success === false)
  );

  if (failedItems.length > 0) {
    throw new Error(
      `Error indexing items ${JSON.stringify(failedItems, null, 2)}`
    );
  }
}

module.exports = (async () => {
const typesense = new Typesense.Client({
(async () => {
const typesense = new Client({
nodes: [
{
host: process.env.TYPESENSE_HOST,
Expand Down Expand Up @@ -151,7 +90,7 @@ module.exports = (async () => {

console.log('Adding records: ');

const fileStream = fs.createReadStream(DATA_FILE);
const fileStream = await fs.open(DATA_FILE, 'r');
const rl = readline.createInterface({
input: fileStream,
crlfDelay: Infinity,
Expand All @@ -164,34 +103,30 @@ module.exports = (async () => {
const parsedRecord = JSON.parse(line);
try {
songs.push(
...parsedRecord['media']
.map(media => media['tracks'])
...parsedRecord.media
.map((media) => media.tracks)
.flat()
.filter(track => track) // To remove nulls
.map(track => {
.filter((track) => track) // To remove nulls
.map((track) => {
const releaseDate =
Math.round(
Date.parse(
parsedRecord['release-group']['first-release-date']
) / 1000
Date.parse(parsedRecord['release-group']['first-release-date']) / 1000
) || 0;

// Be sure to update the schema passed to stringify when updating this structure
const song = {
track_id: track['id'],
title: track['title'],
album_name: parsedRecord['title'],
primary_artist_name:
parsedRecord['artist-credit'][0]['artist']['name'],
track_id: track.id,
title: track.title,
album_name: parsedRecord.title,
primary_artist_name: parsedRecord['artist-credit'][0].artist.name,
genres: [
...track['recording']['genres'].map(g => g.name),
...parsedRecord['genres'].map(g => g.name),
...parsedRecord['release-group']['genres'].map(g => g.name),
...track.recording.genres.map((g) => g.name),
...parsedRecord.genres.map((g) => g.name),
...parsedRecord['release-group'].genres.map((g) => g.name),
].map(
([firstChar, ...rest]) =>
firstChar.toUpperCase() + rest.join('').toLowerCase()
),
country: parsedRecord['country'] || 'Unknown',
country: parsedRecord.country || 'Unknown',
release_date: releaseDate,
release_decade: `${Math.round(
new Date(releaseDate * 1000).getUTCFullYear() / 10
Expand All @@ -201,11 +136,10 @@ module.exports = (async () => {
parsedRecord['release-group']['secondary-types'] || null,
]
.flat()
.filter(e => e),
.filter((e) => e),
urls: extractUrls(parsedRecord),
};
process.stdout.write('.');

return song;
})
);
Expand All @@ -217,7 +151,7 @@ module.exports = (async () => {

if (currentLine % BATCH_SIZE === 0) {
await addSongsToTypesense(songs, typesense, collectionName);
console.log(` Lines upto ${currentLine} ✅`);
console.log(` Lines up to ${currentLine} ✅`);
songs = [];
}

Expand All @@ -233,9 +167,7 @@ module.exports = (async () => {

let oldCollectionName;
try {
oldCollectionName = await typesense.aliases('s').retrieve()[
'collection_name'
];
oldCollectionName = (await typesense.aliases('s').retrieve())['collection_name'];
} catch (error) {
// Do nothing
}
Expand All @@ -252,3 +184,4 @@ module.exports = (async () => {
console.error(error);
}
})();