# Document Similarity and Clustering via Doc2Vec

In [1]:
import numpy as np
import gensim
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Reference job descriptions, one per occupation. Each string becomes one
# tagged training document for Doc2Vec further down, so the punctuation must
# be well-formed for the tokenizer to split sentences and words cleanly.
group_1 = "Computer scientist: A computer scientist is a professional who researches and analyzes how computers store and use information, develops new products or software, and solves practical computing problems. They may work as part of a research team with other IT professionals and engineers, or design efficient computer systems according to clients specifications and business functions. They also inspect and protect the security of the organization's database and manage software applications. They use algorithms, mathematics, physics, and creative thinking to study technological concepts, such as computational complexity theory, computer graphics, and programming languages. They work with different computer technologies and tools to improve and innovate existing and new ones. Some of the skills that a computer scientist needs include technical and mathematical skills, such as software development, hardware engineering, computer programming, linear algebra, discrete mathematics, calculus, and statistics. They also need technical writing skills to create technical manuals and document project data for others to interpret. Additionally, they need project management skills to lead IT teams when completing projects, communication skills to present theories and studies to the scientific community and educate other individuals in the field, and analytical skills to evaluate new computer systems or devices and conduct research experiments to test new theories."
group_2 = "Data scientist: A data scientist is a professional who collects, analyzes, and interprets large amounts of data from various sources to generate insights and solutions for business problems or scientific questions. They may work in different industries or sectors, such as healthcare, finance, education, or e-commerce. They use various methods and tools to process and manipulate data, such as machine learning, artificial intelligence, statistics, programming languages, databases, and visualization software. They also communicate their findings and recommendations to stakeholders or clients using reports, dashboards, or presentations. Some of the skills that a data scientist needs include technical skills such as Python, R, SQL, TensorFlow, Tableau, Excel etc., which are used for data analysis and manipulation. They also need mathematical and statistical skills to apply concepts such as linear algebra, probability theory, regression analysis, hypothesis testing etc., which are used for data modeling and inference. Additionally, they need business skills to understand the domain knowledge and the objectives of the projects, communication skills to explain complex data in simple terms and persuade others with data-driven arguments, and problem-solving skills to identify relevant data sources and design appropriate solutions for the challenges."
group_3 = "Security guard: A security guard is a professional who protects people and property from theft, vandalism, fire, or other hazards. They may work in various settings, such as malls, offices, schools, or museums. They monitor the premises using surveillance cameras or patrol on foot or by vehicle. They also enforce rules and regulations and report any suspicious or illegal activities to the authorities or management. Some of the skills that a security guard needs include physical skills such as strength, stamina, agility, and self-defense, which are used for deterring or responding to threats. They also need observation skills to spot any signs of danger or disturbance and record details of incidents. Additionally, they need communication skills to interact with the public and provide assistance or information when needed, and teamwork skills to coordinate with other security personnel or law enforcement officers when necessary."
# Sample resume used as the query document: it is tokenized and embedded
# further down, then compared against the job descriptions. The text is kept
# free of mis-encoded footnote markers so the tokenizer only sees real words.
example_resume_text = """
- Solutions-oriented computer scientist with vast experience working across all software development phases.
- Adept in identifying and solving complex computer system and software issues.
- Proficient in Python, Java, C++, SQL, TensorFlow, PyTorch and AWS.
- Seeking to leverage my skills and experience to benefit ABC Inc. as a senior computer scientist.
Senior Computer Scientist 
- Led a team of four computer scientists to design, develop and test software solutions for various clients using agile methodology.
- Implemented machine learning models for image recognition, natural language processing and recommendation systems using TensorFlow and PyTorch.
- Improved the performance and scalability of the software systems by optimizing the code, debugging errors and deploying cloud services using AWS.
- Collaborated with other developers, engineers and stakeholders to ensure the quality and functionality of the software products.
Computer Science Intern 
- Assisted senior computer scientists with software development projects using Python, Java and C++.
- Conducted research and analysis on various computer science topics such as artificial intelligence, data structures and algorithms.
- Created and maintained documentation, reports and presentations for the software development process.
- Learned new skills and technologies through online courses, workshops and mentorship.
Bachelor of Science in Computer Science | University of California, Berkeley | Sep 2019 - May 2021
- Graduated with a GPA of 3.8/4.0.
- Completed courses in programming languages, data structures, algorithms, databases, operating systems, software engineering and machine learning.
- Participated in various extracurricular activities such as coding clubs, hackathons and student organizations.
- Programming Languages: Python, Java, C++, SQL
- Frameworks and Libraries: TensorFlow, PyTorch, OpenCV, Flask
- Tools and Platforms: AWS, Git, Android Studio, Heroku
"""

## Doc2Vec

In [3]:
# Training corpus: one TaggedDocument per job description, with the
# word-level tokens as `words` and the occupation name as its single tag.
data = [
    gensim.models.doc2vec.TaggedDocument(words=word_tokenize(description), tags=[occupation])
    for occupation, description in (
        ('computer science', group_1),
        ('data science', group_2),
        ('security guard', group_3),
    )
]

In [4]:
# Short alias for the model class; used for both construction and reload.
Doc2Vec = gensim.models.doc2vec.Doc2Vec

# 500-dimensional document vectors; words seen fewer than 2 times are dropped
# from the vocabulary; 5 passes over the corpus.
model = Doc2Vec(epochs=5, min_count=2, vector_size=500)
model.build_vocab(data)
model.train(data, epochs=model.epochs, total_examples=model.corpus_count)

# Write the trained model to disk, then reload it from that file so the
# following cells work with the persisted copy.
model.save('my_doc2vec_model')
model = Doc2Vec.load('my_doc2vec_model')

In [5]:
# Tokenize the resume with the same tokenizer used for the training
# documents, then infer an embedding for it from the trained model.
resume_tokens = word_tokenize(example_resume_text)
new_document_vector = model.infer_vector(resume_tokens)

# Show the three trained document tags ranked by similarity to the resume.
model.dv.most_similar(positive=[new_document_vector], topn=3)

[('computer science', 0.47634610533714294),
 ('data science', 0.40098142623901367),
 ('security guard', 0.3115231990814209)]

In [6]:
# Gather every trained document vector into one float64 matrix. The shape is
# taken from the model itself instead of a hard-coded (3, 500), so this cell
# keeps working when documents are added or vector_size is changed.
# np.asarray with an explicit dtype copies the model's float32 vectors, so
# the model's own storage is never aliased or modified.
document_embeddings = np.asarray(model.dv.vectors, dtype=np.float64)

# Cosine similarity between every pair of documents; rows and columns follow
# the order of model.dv.index_to_key.
pairwise_similarities = cosine_similarity(document_embeddings)
pairwise_similarities

array([[1.        , 0.33651694, 0.28686826],
       [0.33651694, 1.        , 0.28662256],
       [0.28686826, 0.28662256, 1.        ]])