In [1]:
import pandas as pd
from pymongo import MongoClient
import tiktoken

In [2]:
def _connect_mongo(host, port, username, password, db):
    """ A util for making a connection to mongo """

    if username and password:
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (username, password, host, port, db)
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)


    return conn[db]

In [56]:
def read_mongo(db, collection, host='localhost', port=27017, username=None, password=None, no_id=True):
    """ Read from Mongo and Store into DataFrame """

    # Connect to MongoDB
    db = _connect_mongo(host=host, port=port, username=username, password=password, db=db)

    # Make a query to the specific DB and Collection
    cursor = db[collection].find({'job_source':'gumtree'},{'position':1,'location':1,'job_source':1,'Details':1,'Ad_id':1,'Recruiter':1,'_id':0})

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(list(cursor))

    # Delete the _id

    return df

In [60]:
data_fram=read_mongo("job_scraper","jobs")
data=data_fram[1000:1200]
data["Recruiter"].fillna("No recuriter details",inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 1000 to 1199
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   position    200 non-null    object
 1   location    200 non-null    object
 2   Recruiter   200 non-null    object
 3   Ad_id       200 non-null    object
 4   job_source  200 non-null    object
 5   Details     200 non-null    object
dtypes: object(6)
memory usage: 9.5+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["Recruiter"].fillna("No recuriter details",inplace=True)


In [63]:
data.head(1)

Unnamed: 0,position,location,Recruiter,Ad_id,job_source,Details
1000,BUCKINGHAMSHIRE -LIVE IN - COUPLE OR SINGLE LA...,"Buckingham, Buckinghamshire",The Graham Agency,5412070110,gumtree,"PROFESSIONAL, BUSY FAMILY WITH ONE YOUNG ADULT..."


In [64]:
data["text"]=" Job Position:"+data.position + "; Recruiter Company: " + data["Recruiter"] +"; Job Location:"+ data["location"] +"; Job Description: "+data["Details"] +"Job Source:"+ data["job_source"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["text"]=" Job Position:"+data.position + "; Recruiter Company: " + data["Recruiter"] +"; Job Location:"+ data["location"] +"; Job Description: "+data["Details"] +"Job Source:"+ data["job_source"]


In [66]:
data.head()
pd.set_option('display.max_colwidth', 500)
data["text"].head(10)

1000     Job Position:BUCKINGHAMSHIRE -LIVE IN - COUPLE OR SINGLE LADY - HK/COOK/DRIVER/OUTDOOR DUTIES/DOG FRIENDLY; Recruiter Company: The Graham Agency; Job Location:Buckingham, Buckinghamshire; Job Description: PROFESSIONAL, BUSY FAMILY WITH ONE YOUNG ADULT STUDYING AND 3 DOGS REQUIRE EITHER  ONE EXPERIENCED HOUSEKEEPER/DRIVER/DINNER PARTY COOK TO BE RESPONSIBLE FOR THE DAY TO DAY RUNNING OF THE HOME TO INCLUDE,LAUNDRY/IRONING/RUNNING ERRANDS/SHOPPING/COOKING OR  A COUPLE- HOUSEKEEPING DUTIES AS ...
1001     Job Position:Event Rigger - Exhibitions/ Festivals (van driver); Recruiter Company: No recuriter details; Job Location:Northampton, Northamptonshire; Job Description: We are seeking a van driver/rigger.  We have a full time position available in our logistics and operations team. You would need to be able to drive a van and be capable of setting up and breaking down exhibitions and events.  The role includes working weekends, generally Thursday - Monday, Saturdays and Sundays ar

In [67]:
def remove_newlines(serie):
    serie = serie.str.replace('\n', ' ', regex=False)
    serie = serie.str.replace('\\n', ' ', regex=False)
    serie = serie.str.replace('  ',' ', regex=False)
    serie = serie.str.replace('  ',' ', regex=False)
    return serie

In [68]:
data['text'] = remove_newlines(data.text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text'] = remove_newlines(data.text)


In [69]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 1000 to 1199
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   position    200 non-null    object
 1   location    200 non-null    object
 2   Recruiter   200 non-null    object
 3   Ad_id       200 non-null    object
 4   job_source  200 non-null    object
 5   Details     200 non-null    object
 6   text        200 non-null    object
dtypes: object(7)
memory usage: 11.1+ KB


In [70]:
from transformers import GPT2TokenizerFast

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [71]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
data['n_tokens'] = data.text.apply(lambda x: len(tokenizer.encode(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['n_tokens'] = data.text.apply(lambda x: len(tokenizer.encode(x)))


In [73]:
data.head(5)

Unnamed: 0,position,location,Recruiter,Ad_id,job_source,Details,text,n_tokens
1000,BUCKINGHAMSHIRE -LIVE IN - COUPLE OR SINGLE LADY - HK/COOK/DRIVER/OUTDOOR DUTIES/DOG FRIENDLY,"Buckingham, Buckinghamshire",The Graham Agency,5412070110,gumtree,"PROFESSIONAL, BUSY FAMILY WITH ONE YOUNG ADULT STUDYING AND 3 DOGS REQUIRE EITHER ONE EXPERIENCED HOUSEKEEPER/DRIVER/DINNER PARTY COOK TO BE RESPONSIBLE FOR THE DAY TO DAY RUNNING OF THE HOME TO INCLUDE,LAUNDRY/IRONING/RUNNING ERRANDS/SHOPPING/COOKING OR A COUPLE- HOUSEKEEPING DUTIES AS ABOVE - PARTNER WILL BE RESPONSIBLE FOR ALL EXTERNAL DUTIES, TO INCLUDE SOME GARDENING/DRIVING/MAINTAINING THE CARS TO A CLEAN STANDARD /LIAISE WITH TRADES PERSONS/CONTRACTORS THIS IS A 5 DAY WEEK POSITION...","Job Position:BUCKINGHAMSHIRE -LIVE IN - COUPLE OR SINGLE LADY - HK/COOK/DRIVER/OUTDOOR DUTIES/DOG FRIENDLY; Recruiter Company: The Graham Agency; Job Location:Buckingham, Buckinghamshire; Job Description: PROFESSIONAL, BUSY FAMILY WITH ONE YOUNG ADULT STUDYING AND 3 DOGS REQUIRE EITHER ONE EXPERIENCED HOUSEKEEPER/DRIVER/DINNER PARTY COOK TO BE RESPONSIBLE FOR THE DAY TO DAY RUNNING OF THE HOME TO INCLUDE,LAUNDRY/IRONING/RUNNING ERRANDS/SHOPPING/COOKING OR A COUPLE- HOUSEKEEPING DUTIES AS AB...",319
1001,Event Rigger - Exhibitions/ Festivals (van driver),"Northampton, Northamptonshire",No recuriter details,5412107513,gumtree,"We are seeking a van driver/rigger. We have a full time position available in our logistics and operations team. You would need to be able to drive a van and be capable of setting up and breaking down exhibitions and events. The role includes working weekends, generally Thursday - Monday, Saturdays and Sundays are considered working days every week (except when booked as holiday). However, we will require some flexibility with regards to these days as events can differ week to week. The i...","Job Position:Event Rigger - Exhibitions/ Festivals (van driver); Recruiter Company: No recuriter details; Job Location:Northampton, Northamptonshire; Job Description: We are seeking a van driver/rigger. We have a full time position available in our logistics and operations team. You would need to be able to drive a van and be capable of setting up and breaking down exhibitions and events. The role includes working weekends, generally Thursday - Monday, Saturdays and Sundays are considered w...",385
1002,Business Development Manager (Part Time-Field Job),United Kingdom,No recuriter details,5412123273,gumtree,"Company Overview: We are a leading company based in the USA, specializing in partnering and venturing with companies across various industries. With our expertise in establishing joint ventures and partnerships, we aim to facilitate the growth and expansion of businesses in the United States. We are now expanding our operations to the UK and seeking a skilled and dynamic individual to join our team as a Business Development Manager. Position: Business Development Manager Location: United ...","Job Position:Business Development Manager (Part Time-Field Job); Recruiter Company: No recuriter details; Job Location:United Kingdom; Job Description: Company Overview: We are a leading company based in the USA, specializing in partnering and venturing with companies across various industries. With our expertise in establishing joint ventures and partnerships, we aim to facilitate the growth and expansion of businesses in the United States. We are now expanding our operations to the UK and...",618
1003,Maintenance Engineer / Multi Trader / Highly Skilled,"Clapham Common, London",No recuriter details,5412121773,gumtree,"TITLE: Maintenance Engineer / Multi Trader / Highly Skilled LOCATION: SW4 6DH, London, United Kingdom TERMS: Self-employed SALARY: £36,000 - £45,500 / £150 - £190 Per Day / All Expenses Paid BENEFITS: Parking & Petrol Paid For Company Card For Materials Brand New Van Career Progression Recognition For Excellence Join a Growing Business ABOUT US: Optimal Maintenance Ltd offers property maintenance to a variety of clients from property management companies to high-end estate agents...","Job Position:Maintenance Engineer / Multi Trader / Highly Skilled; Recruiter Company: No recuriter details; Job Location:Clapham Common, London; Job Description: TITLE: Maintenance Engineer / Multi Trader / Highly Skilled LOCATION: SW4 6DH, London, United Kingdom TERMS: Self-employed SALARY: £36,000 - £45,500 / £150 - £190 Per Day / All Expenses Paid BENEFITS: Parking & Petrol Paid For Company Card For Materials Brand New Van Career Progression Recognition For Excellence Join a Growing Busi...",671
1004,"Associate Dentist (Maternity Cover) - North Ayrshire, 30 minutes from Glasgow","Stevenston, North Ayrshire",Clyde Dental Practice Limited,5412047263,gumtree,"We are recruiting a part-time Associate Dentist to join our team at Three Towns Stevenston Dental Care. This will be on a fixed term contract basis to cover maternity leave from July 2023, a permanent opportunity may follow. You will be working Monday, Tuesday and Thursday in a two surgery practice located in the centre of town with free parking available. What’s on offer with Clyde Munro Dental: Dedicated Clinical Support team to help develop your career and grow your income Scotland’s on...","Job Position:Associate Dentist (Maternity Cover) - North Ayrshire, 30 minutes from Glasgow; Recruiter Company: Clyde Dental Practice Limited; Job Location:Stevenston, North Ayrshire; Job Description: We are recruiting a part-time Associate Dentist to join our team at Three Towns Stevenston Dental Care. This will be on a fixed term contract basis to cover maternity leave from July 2023, a permanent opportunity may follow. You will be working Monday, Tuesday and Thursday in a two surgery prac...",375


In [74]:
pd.set_option('display.max_colwidth', 500)

data["text"].head()

1000     Job Position:BUCKINGHAMSHIRE -LIVE IN - COUPLE OR SINGLE LADY - HK/COOK/DRIVER/OUTDOOR DUTIES/DOG FRIENDLY; Recruiter Company: The Graham Agency; Job Location:Buckingham, Buckinghamshire; Job Description: PROFESSIONAL, BUSY FAMILY WITH ONE YOUNG ADULT STUDYING AND 3 DOGS REQUIRE EITHER ONE EXPERIENCED HOUSEKEEPER/DRIVER/DINNER PARTY COOK TO BE RESPONSIBLE FOR THE DAY TO DAY RUNNING OF THE HOME TO INCLUDE,LAUNDRY/IRONING/RUNNING ERRANDS/SHOPPING/COOKING OR A COUPLE- HOUSEKEEPING DUTIES AS AB...
1001     Job Position:Event Rigger - Exhibitions/ Festivals (van driver); Recruiter Company: No recuriter details; Job Location:Northampton, Northamptonshire; Job Description: We are seeking a van driver/rigger. We have a full time position available in our logistics and operations team. You would need to be able to drive a van and be capable of setting up and breaking down exhibitions and events. The role includes working weekends, generally Thursday - Monday, Saturdays and Sundays are 

In [75]:
data.to_csv("gumtree_jobs_data.csv")

In [76]:
import openai
openai.api_key="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"
data['embeddings']=data.text.apply(lambda x:openai.Embedding.create(input=x,engine='text-embedding-ada-002')['data'][0]['embedding'])
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['embeddings']=data.text.apply(lambda x:openai.Embedding.create(input=x,engine='text-embedding-ada-002')['data'][0]['embedding'])


Unnamed: 0,position,location,Recruiter,Ad_id,job_source,Details,text,n_tokens,embeddings
1000,BUCKINGHAMSHIRE -LIVE IN - COUPLE OR SINGLE LADY - HK/COOK/DRIVER/OUTDOOR DUTIES/DOG FRIENDLY,"Buckingham, Buckinghamshire",The Graham Agency,5412070110,gumtree,"PROFESSIONAL, BUSY FAMILY WITH ONE YOUNG ADULT STUDYING AND 3 DOGS REQUIRE EITHER ONE EXPERIENCED HOUSEKEEPER/DRIVER/DINNER PARTY COOK TO BE RESPONSIBLE FOR THE DAY TO DAY RUNNING OF THE HOME TO INCLUDE,LAUNDRY/IRONING/RUNNING ERRANDS/SHOPPING/COOKING OR A COUPLE- HOUSEKEEPING DUTIES AS ABOVE - PARTNER WILL BE RESPONSIBLE FOR ALL EXTERNAL DUTIES, TO INCLUDE SOME GARDENING/DRIVING/MAINTAINING THE CARS TO A CLEAN STANDARD /LIAISE WITH TRADES PERSONS/CONTRACTORS THIS IS A 5 DAY WEEK POSITION...","Job Position:BUCKINGHAMSHIRE -LIVE IN - COUPLE OR SINGLE LADY - HK/COOK/DRIVER/OUTDOOR DUTIES/DOG FRIENDLY; Recruiter Company: The Graham Agency; Job Location:Buckingham, Buckinghamshire; Job Description: PROFESSIONAL, BUSY FAMILY WITH ONE YOUNG ADULT STUDYING AND 3 DOGS REQUIRE EITHER ONE EXPERIENCED HOUSEKEEPER/DRIVER/DINNER PARTY COOK TO BE RESPONSIBLE FOR THE DAY TO DAY RUNNING OF THE HOME TO INCLUDE,LAUNDRY/IRONING/RUNNING ERRANDS/SHOPPING/COOKING OR A COUPLE- HOUSEKEEPING DUTIES AS AB...",319,"[-0.03147120773792267, -0.0004883953370153904, 0.013917387463152409, -0.0362306609749794, -0.001151426462456584, 0.03687238693237305, -0.01906454749405384, 0.0059326039627194405, -0.012694100849330425, -0.037514109164476395, -0.01625700481235981, -0.0026738494634628296, 0.004893144592642784, -0.003927216399461031, 0.0009258204372599721, 0.000701885495800525, 0.017674146220088005, 0.013636632822453976, -0.003449265845119953, -0.020602010190486908, -0.022166213020682335, 0.020521795377135277, ..."
1001,Event Rigger - Exhibitions/ Festivals (van driver),"Northampton, Northamptonshire",No recuriter details,5412107513,gumtree,"We are seeking a van driver/rigger. We have a full time position available in our logistics and operations team. You would need to be able to drive a van and be capable of setting up and breaking down exhibitions and events. The role includes working weekends, generally Thursday - Monday, Saturdays and Sundays are considered working days every week (except when booked as holiday). However, we will require some flexibility with regards to these days as events can differ week to week. The i...","Job Position:Event Rigger - Exhibitions/ Festivals (van driver); Recruiter Company: No recuriter details; Job Location:Northampton, Northamptonshire; Job Description: We are seeking a van driver/rigger. We have a full time position available in our logistics and operations team. You would need to be able to drive a van and be capable of setting up and breaking down exhibitions and events. The role includes working weekends, generally Thursday - Monday, Saturdays and Sundays are considered w...",385,"[-0.0063501084223389626, -0.017786970362067223, -0.0029967178124934435, -0.01518692634999752, -0.023840406909585, 0.03181387856602669, -0.026453785598278046, -0.016666950657963753, -0.0031100530177354813, -0.012706884182989597, -0.00288504920899868, -0.002993384376168251, -0.005720097571611404, 0.0004925084067508578, 0.0011258525773882866, -0.012900220230221748, 0.03373390808701515, 0.006353442091494799, -0.022827057167887688, -0.03178720921278, 0.0037467307411134243, -0.0026633788365870714,..."
1002,Business Development Manager (Part Time-Field Job),United Kingdom,No recuriter details,5412123273,gumtree,"Company Overview: We are a leading company based in the USA, specializing in partnering and venturing with companies across various industries. With our expertise in establishing joint ventures and partnerships, we aim to facilitate the growth and expansion of businesses in the United States. We are now expanding our operations to the UK and seeking a skilled and dynamic individual to join our team as a Business Development Manager. Position: Business Development Manager Location: United ...","Job Position:Business Development Manager (Part Time-Field Job); Recruiter Company: No recuriter details; Job Location:United Kingdom; Job Description: Company Overview: We are a leading company based in the USA, specializing in partnering and venturing with companies across various industries. With our expertise in establishing joint ventures and partnerships, we aim to facilitate the growth and expansion of businesses in the United States. We are now expanding our operations to the UK and...",618,"[-0.033895343542099, -0.025641102343797684, 0.004543706774711609, -0.025163158774375916, -0.02694576419889927, 0.03779640793800354, -0.01206488162279129, -0.007298349402844906, 0.004220771137624979, -0.014829212799668312, -0.0035490645095705986, 0.003050128696486354, 0.00839633122086525, -0.01183236762881279, 0.0019699083641171455, -0.01241365261375904, 0.014247927814722061, -0.014764625579118729, -0.0022524772211909294, -0.028211671859025955, -0.00807985384017229, -0.0012990093091502786, 0...."
1003,Maintenance Engineer / Multi Trader / Highly Skilled,"Clapham Common, London",No recuriter details,5412121773,gumtree,"TITLE: Maintenance Engineer / Multi Trader / Highly Skilled LOCATION: SW4 6DH, London, United Kingdom TERMS: Self-employed SALARY: £36,000 - £45,500 / £150 - £190 Per Day / All Expenses Paid BENEFITS: Parking & Petrol Paid For Company Card For Materials Brand New Van Career Progression Recognition For Excellence Join a Growing Business ABOUT US: Optimal Maintenance Ltd offers property maintenance to a variety of clients from property management companies to high-end estate agents...","Job Position:Maintenance Engineer / Multi Trader / Highly Skilled; Recruiter Company: No recuriter details; Job Location:Clapham Common, London; Job Description: TITLE: Maintenance Engineer / Multi Trader / Highly Skilled LOCATION: SW4 6DH, London, United Kingdom TERMS: Self-employed SALARY: £36,000 - £45,500 / £150 - £190 Per Day / All Expenses Paid BENEFITS: Parking & Petrol Paid For Company Card For Materials Brand New Van Career Progression Recognition For Excellence Join a Growing Busi...",671,"[-0.017187179997563362, -0.006330160424113274, 0.004398294258862734, -0.0339142307639122, -0.021301276981830597, 0.036269012838602066, 0.0020570484921336174, 0.0027912252116948366, -0.01828337088227272, -0.014020408503711224, -0.01173329632729292, 0.014318139292299747, -0.011503231711685658, 0.007463566958904266, 0.004208829253911972, -0.018351037055253983, 0.025428907945752144, -0.017430778592824936, -0.011760362423956394, -0.0238590557128191, -0.029962534084916115, 0.025821371003985405, -0..."
1004,"Associate Dentist (Maternity Cover) - North Ayrshire, 30 minutes from Glasgow","Stevenston, North Ayrshire",Clyde Dental Practice Limited,5412047263,gumtree,"We are recruiting a part-time Associate Dentist to join our team at Three Towns Stevenston Dental Care. This will be on a fixed term contract basis to cover maternity leave from July 2023, a permanent opportunity may follow. You will be working Monday, Tuesday and Thursday in a two surgery practice located in the centre of town with free parking available. What’s on offer with Clyde Munro Dental: Dedicated Clinical Support team to help develop your career and grow your income Scotland’s on...","Job Position:Associate Dentist (Maternity Cover) - North Ayrshire, 30 minutes from Glasgow; Recruiter Company: Clyde Dental Practice Limited; Job Location:Stevenston, North Ayrshire; Job Description: We are recruiting a part-time Associate Dentist to join our team at Three Towns Stevenston Dental Care. This will be on a fixed term contract basis to cover maternity leave from July 2023, a permanent opportunity may follow. You will be working Monday, Tuesday and Thursday in a two surgery prac...",375,"[-0.00029870541766285896, 0.018027016893029213, 0.022002002224326134, -0.026548638939857483, -0.044801659882068634, 0.04873675853013992, -0.01797384023666382, 0.009086627513170242, -0.00822249986231327, -0.011326709762215614, 0.00657068844884634, 0.004304016940295696, -0.005560324527323246, -0.006537452805787325, -0.01410521101206541, -0.011246944777667522, 0.011027589440345764, -0.008973625488579273, -0.0058129155077040195, -0.005806268192827702, -0.04134514927864075, -0.011858480982482433,..."


In [77]:
data.to_parquet('gumtree_jobs_embeddings.parquet',engine='fastparquet')

In [79]:
df=pd.read_parquet('gumtree_jobs_embeddings.parquet')
df.head()

Unnamed: 0,position,location,Recruiter,Ad_id,job_source,Details,text,n_tokens,embeddings
1000,BUCKINGHAMSHIRE -LIVE IN - COUPLE OR SINGLE LADY - HK/COOK/DRIVER/OUTDOOR DUTIES/DOG FRIENDLY,"Buckingham, Buckinghamshire",The Graham Agency,5412070110,gumtree,"PROFESSIONAL, BUSY FAMILY WITH ONE YOUNG ADULT STUDYING AND 3 DOGS REQUIRE EITHER ONE EXPERIENCED HOUSEKEEPER/DRIVER/DINNER PARTY COOK TO BE RESPONSIBLE FOR THE DAY TO DAY RUNNING OF THE HOME TO INCLUDE,LAUNDRY/IRONING/RUNNING ERRANDS/SHOPPING/COOKING OR A COUPLE- HOUSEKEEPING DUTIES AS ABOVE - PARTNER WILL BE RESPONSIBLE FOR ALL EXTERNAL DUTIES, TO INCLUDE SOME GARDENING/DRIVING/MAINTAINING THE CARS TO A CLEAN STANDARD /LIAISE WITH TRADES PERSONS/CONTRACTORS THIS IS A 5 DAY WEEK POSITION...","Job Position:BUCKINGHAMSHIRE -LIVE IN - COUPLE OR SINGLE LADY - HK/COOK/DRIVER/OUTDOOR DUTIES/DOG FRIENDLY; Recruiter Company: The Graham Agency; Job Location:Buckingham, Buckinghamshire; Job Description: PROFESSIONAL, BUSY FAMILY WITH ONE YOUNG ADULT STUDYING AND 3 DOGS REQUIRE EITHER ONE EXPERIENCED HOUSEKEEPER/DRIVER/DINNER PARTY COOK TO BE RESPONSIBLE FOR THE DAY TO DAY RUNNING OF THE HOME TO INCLUDE,LAUNDRY/IRONING/RUNNING ERRANDS/SHOPPING/COOKING OR A COUPLE- HOUSEKEEPING DUTIES AS AB...",319,"[-0.03147120773792267, -0.0004883953370153904, 0.013917387463152409, -0.0362306609749794, -0.001151426462456584, 0.03687238693237305, -0.01906454749405384, 0.0059326039627194405, -0.012694100849330425, -0.037514109164476395, -0.01625700481235981, -0.0026738494634628296, 0.004893144592642784, -0.003927216399461031, 0.0009258204372599721, 0.000701885495800525, 0.017674146220088005, 0.013636632822453976, -0.003449265845119953, -0.020602010190486908, -0.022166213020682335, 0.020521795377135277, ..."
1001,Event Rigger - Exhibitions/ Festivals (van driver),"Northampton, Northamptonshire",No recuriter details,5412107513,gumtree,"We are seeking a van driver/rigger. We have a full time position available in our logistics and operations team. You would need to be able to drive a van and be capable of setting up and breaking down exhibitions and events. The role includes working weekends, generally Thursday - Monday, Saturdays and Sundays are considered working days every week (except when booked as holiday). However, we will require some flexibility with regards to these days as events can differ week to week. The i...","Job Position:Event Rigger - Exhibitions/ Festivals (van driver); Recruiter Company: No recuriter details; Job Location:Northampton, Northamptonshire; Job Description: We are seeking a van driver/rigger. We have a full time position available in our logistics and operations team. You would need to be able to drive a van and be capable of setting up and breaking down exhibitions and events. The role includes working weekends, generally Thursday - Monday, Saturdays and Sundays are considered w...",385,"[-0.0063501084223389626, -0.017786970362067223, -0.0029967178124934435, -0.01518692634999752, -0.023840406909585, 0.03181387856602669, -0.026453785598278046, -0.016666950657963753, -0.0031100530177354813, -0.012706884182989597, -0.00288504920899868, -0.002993384376168251, -0.005720097571611404, 0.0004925084067508578, 0.0011258525773882866, -0.012900220230221748, 0.03373390808701515, 0.006353442091494799, -0.022827057167887688, -0.03178720921278, 0.0037467307411134243, -0.0026633788365870714,..."
1002,Business Development Manager (Part Time-Field Job),United Kingdom,No recuriter details,5412123273,gumtree,"Company Overview: We are a leading company based in the USA, specializing in partnering and venturing with companies across various industries. With our expertise in establishing joint ventures and partnerships, we aim to facilitate the growth and expansion of businesses in the United States. We are now expanding our operations to the UK and seeking a skilled and dynamic individual to join our team as a Business Development Manager. Position: Business Development Manager Location: United ...","Job Position:Business Development Manager (Part Time-Field Job); Recruiter Company: No recuriter details; Job Location:United Kingdom; Job Description: Company Overview: We are a leading company based in the USA, specializing in partnering and venturing with companies across various industries. With our expertise in establishing joint ventures and partnerships, we aim to facilitate the growth and expansion of businesses in the United States. We are now expanding our operations to the UK and...",618,"[-0.033895343542099, -0.025641102343797684, 0.004543706774711609, -0.025163158774375916, -0.02694576419889927, 0.03779640793800354, -0.01206488162279129, -0.007298349402844906, 0.004220771137624979, -0.014829212799668312, -0.0035490645095705986, 0.003050128696486354, 0.00839633122086525, -0.01183236762881279, 0.0019699083641171455, -0.01241365261375904, 0.014247927814722061, -0.014764625579118729, -0.0022524772211909294, -0.028211671859025955, -0.00807985384017229, -0.0012990093091502786, 0...."
1003,Maintenance Engineer / Multi Trader / Highly Skilled,"Clapham Common, London",No recuriter details,5412121773,gumtree,"TITLE: Maintenance Engineer / Multi Trader / Highly Skilled LOCATION: SW4 6DH, London, United Kingdom TERMS: Self-employed SALARY: £36,000 - £45,500 / £150 - £190 Per Day / All Expenses Paid BENEFITS: Parking & Petrol Paid For Company Card For Materials Brand New Van Career Progression Recognition For Excellence Join a Growing Business ABOUT US: Optimal Maintenance Ltd offers property maintenance to a variety of clients from property management companies to high-end estate agents...","Job Position:Maintenance Engineer / Multi Trader / Highly Skilled; Recruiter Company: No recuriter details; Job Location:Clapham Common, London; Job Description: TITLE: Maintenance Engineer / Multi Trader / Highly Skilled LOCATION: SW4 6DH, London, United Kingdom TERMS: Self-employed SALARY: £36,000 - £45,500 / £150 - £190 Per Day / All Expenses Paid BENEFITS: Parking & Petrol Paid For Company Card For Materials Brand New Van Career Progression Recognition For Excellence Join a Growing Busi...",671,"[-0.017187179997563362, -0.006330160424113274, 0.004398294258862734, -0.0339142307639122, -0.021301276981830597, 0.036269012838602066, 0.0020570484921336174, 0.0027912252116948366, -0.01828337088227272, -0.014020408503711224, -0.01173329632729292, 0.014318139292299747, -0.011503231711685658, 0.007463566958904266, 0.004208829253911972, -0.018351037055253983, 0.025428907945752144, -0.017430778592824936, -0.011760362423956394, -0.0238590557128191, -0.029962534084916115, 0.025821371003985405, -0..."
1004,"Associate Dentist (Maternity Cover) - North Ayrshire, 30 minutes from Glasgow","Stevenston, North Ayrshire",Clyde Dental Practice Limited,5412047263,gumtree,"We are recruiting a part-time Associate Dentist to join our team at Three Towns Stevenston Dental Care. This will be on a fixed term contract basis to cover maternity leave from July 2023, a permanent opportunity may follow. You will be working Monday, Tuesday and Thursday in a two surgery practice located in the centre of town with free parking available. What’s on offer with Clyde Munro Dental: Dedicated Clinical Support team to help develop your career and grow your income Scotland’s on...","Job Position:Associate Dentist (Maternity Cover) - North Ayrshire, 30 minutes from Glasgow; Recruiter Company: Clyde Dental Practice Limited; Job Location:Stevenston, North Ayrshire; Job Description: We are recruiting a part-time Associate Dentist to join our team at Three Towns Stevenston Dental Care. This will be on a fixed term contract basis to cover maternity leave from July 2023, a permanent opportunity may follow. You will be working Monday, Tuesday and Thursday in a two surgery prac...",375,"[-0.00029870541766285896, 0.018027016893029213, 0.022002002224326134, -0.026548638939857483, -0.044801659882068634, 0.04873675853013992, -0.01797384023666382, 0.009086627513170242, -0.00822249986231327, -0.011326709762215614, 0.00657068844884634, 0.004304016940295696, -0.005560324527323246, -0.006537452805787325, -0.01410521101206541, -0.011246944777667522, 0.011027589440345764, -0.008973625488579273, -0.0058129155077040195, -0.005806268192827702, -0.04134514927864075, -0.011858480982482433,..."


In [80]:
from sys import getsizeof

too_big = []

for text in df['text'].tolist():
    if getsizeof(text) > 5000:
        too_big.append((text, getsizeof(text)))

print(f"{len(too_big)} / {len(df)} records are too big")

17 / 200 records are too big


In [81]:
import pinecone

pinecone.init(
    api_key='1dc22cc0-57e4-47c7-a364-3fd34c1ec992',  # app.pinecone.io
    environment='us-west4-gcp'
)


In [82]:
pd.reset_option("display.max_colwidth")

In [83]:
index_name='whitebox'
if not index_name in pinecone.list_indexes():
    pinecone.create_index(
        index_name, dimension=len(df['embeddings'].tolist()[0]),
        metric='cosine'
    )

index = pinecone.Index(index_name)

In [84]:
pinecone.Index(index_name)

<pinecone.index.Index at 0x7ff75b48d520>

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 1000 to 1199
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   position    200 non-null    object
 1   location    200 non-null    object
 2   Recruiter   200 non-null    object
 3   Ad_id       200 non-null    object
 4   job_source  200 non-null    object
 5   Details     200 non-null    object
 6   text        200 non-null    object
 7   n_tokens    200 non-null    int64 
 8   embeddings  200 non-null    object
dtypes: int64(1), object(8)
memory usage: 14.2+ KB


In [86]:
df['job_id'] = [str(i) for i in df["Ad_id"]]
df.head()

Unnamed: 0,position,location,Recruiter,Ad_id,job_source,Details,text,n_tokens,embeddings,job_id
1000,BUCKINGHAMSHIRE -LIVE IN - COUPLE OR SINGLE LA...,"Buckingham, Buckinghamshire",The Graham Agency,5412070110,gumtree,"PROFESSIONAL, BUSY FAMILY WITH ONE YOUNG ADULT...",Job Position:BUCKINGHAMSHIRE -LIVE IN - COUPL...,319,"[-0.03147120773792267, -0.0004883953370153904,...",5412070110
1001,Event Rigger - Exhibitions/ Festivals (van dri...,"Northampton, Northamptonshire",No recuriter details,5412107513,gumtree,We are seeking a van driver/rigger. We have a...,Job Position:Event Rigger - Exhibitions/ Fest...,385,"[-0.0063501084223389626, -0.017786970362067223...",5412107513
1002,Business Development Manager (Part Time-Field ...,United Kingdom,No recuriter details,5412123273,gumtree,Company Overview: We are a leading company ba...,Job Position:Business Development Manager (Pa...,618,"[-0.033895343542099, -0.025641102343797684, 0....",5412123273
1003,Maintenance Engineer / Multi Trader / Highly S...,"Clapham Common, London",No recuriter details,5412121773,gumtree,TITLE: Maintenance Engineer / Multi Trader / H...,Job Position:Maintenance Engineer / Multi Tra...,671,"[-0.017187179997563362, -0.006330160424113274,...",5412121773
1004,Associate Dentist (Maternity Cover) - North Ay...,"Stevenston, North Ayrshire",Clyde Dental Practice Limited,5412047263,gumtree,We are recruiting a part-time Associate Dentis...,Job Position:Associate Dentist (Maternity Cov...,375,"[-0.00029870541766285896, 0.018027016893029213...",5412047263


In [87]:
from tqdm.auto import tqdm

batch_size = 32

for i in tqdm(range(0, len(df), batch_size)):
    i_end = min(i+batch_size, len(df))
    df_slice = df.iloc[i:i_end]
    to_upsert = [
        (
            row['job_id'],
            row['embeddings'],
            {
                'position': row['position'],
                'company': row['Recruiter'],
                'location': row['location'],
                'job_source': row['job_source'],
                'details': row['text'],
                

                'n_tokens': row['n_tokens']
            }
        ) for _, row in df_slice.iterrows()
    ]
    index.upsert(vectors=to_upsert,namespace='jobs')

100%|██████████| 7/7 [00:06<00:00,  1.08it/s]
