In [34]:
import os
from pathlib import Path
import logging
import json
from datetime import datetime, timedelta
from GroqcloudLLM.text_extraction import extract_and_clean_text,clean_text
from GroqcloudLLM.main import ResumeParser
from Expericecal.total_exp import format_experience, calculator
from database.operations import ResumeOperations, SkillsTitlesOperations
from database.client import get_collection, get_skills_titles_collection
from core.vectorizer import Vectorizer
from fastapi import FastAPI, Request, HTTPException
# Initialize your parser with API keys (replace with your actual keys)

import pandas as pd
def process_resume_csv(file_path, text_to_add="", sample_size=30):
    """
    Reads a CSV file, adds given text above each resume text,
    and returns a randomly sampled DataFrame of 30 rows.
    """
    try:
        # Read CSV
        df = pd.read_csv(file_path)

        # Check if 'content' column exists
        if 'content' not in df.columns:
            raise ValueError("CSV must contain 'content' column")

        # Add text above each resume text
        df['content'] = text_to_add + df['content'].astype(str)

        # Randomly sample rows
        sampled_df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

        return sampled_df

    except FileNotFoundError:
        print(f"Error: Could not find CSV file at {file_path}")
        raise
    except Exception as e:
        print(f"Error processing CSV: {str(e)}")
        raise

parser = ResumeParser()
collection = get_collection()
skills_titles_collection = get_skills_titles_collection()
# Initialize database operations
skills_ops = SkillsTitlesOperations(skills_titles_collection)
vectorizer = Vectorizer()
resume_ops = ResumeOperations(collection, vectorizer)

# Create a router instance

# Configure logging
logging.basicConfig(
    filename="cleanup.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)


def extract_clean_text_from_raw(resume_text: str):
    """
    Endpoint to extract and clean text from raw resume text input for LLM model.
    """
    try:

        # Clean the input text
        cleaned_text = clean_text(resume_text)

        # Parse the cleaned resume
        resume_parser = parser.process_resume(cleaned_text)

   

        # Convert to dict if needed (already is in dict if process_resume returns it)
        if isinstance(resume_parser, str):
            resume_parser = json.loads(resume_parser)

        # Initialize total_experience if not present
        if "total_experience" not in resume_parser:
            resume_parser["total_experience"] = 0

        # Calculate experience
        res = calculator.calculate_experience(resume_parser)
        resume_parser["total_experience"] = format_experience(res[0], res[1])

        # Extract experience titles
        experience_titles = []
        if "experience" in resume_parser:
            for experience in resume_parser["experience"]:
                if "title" in experience:
                    experience_titles.append(experience["title"])

        # Extract skills
        skills = []
        if "skills" in resume_parser:
            skills = resume_parser["skills"]

        # Store in database
        resume_ops.create_resume(resume_parser)
        skills_ops.add_multiple_skills(skills)
        skills_ops.add_multiple_titles(experience_titles)

        logging.info(f"Added skills: {skills}")
        logging.info(f"Added experience titles: {experience_titles}")

        return "donw"

    except Exception as e:
        raise e
import pandas as pd
def process_resume_csv(file_path, text_to_add="", sample_size=30):
    """
    Reads a CSV file, adds given text above each resume text,
    and returns a randomly sampled DataFrame of 30 rows.
    """
    try:
        # Read CSV
        df = pd.read_csv(file_path)

        # Check if 'content' column exists
        if 'content' not in df.columns:
            raise ValueError("CSV must contain 'content' column")

        # Add text above each resume text
        df['content'] = text_to_add + df['content'].astype(str)

        # Randomly sample rows
        sampled_df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

        return sampled_df

    except FileNotFoundError:
        print(f"Error: Could not find CSV file at {file_path}")
        raise
    except Exception as e:
        print(f"Error processing CSV: {str(e)}")
        raise

# Main execution with error handling
try:
    df = process_resume_csv(r"C:\Users\pveer\OneDrive\Desktop\Uphire\csvdatasets\resumesfinal1.csv")
    for index, row in df.iterrows():
        try:
            print(f"Processing resume {index + 1}")
            cleaned_text = extract_clean_text_from_raw(str(row['content']))
            import time
            # Optional: Add delay to avoid overwhelming server
            time.sleep(10)
            print(f"Successfully processed resume {index + 1}")
            # Optional: Add delay to avoid overwhelming server
            # time.sleep(1)
        except Exception as e:
            print(f"Error processing resume {index + 1}: {str(e)}")
            continue
except Exception as e:
    print(f"Fatal error: {str(e)}")
#"C:\Users\pveer\OneDrive\Desktop\Uphire\csvdatasets\resumesfinal1.csv"

Processing resume 1


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 54.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.69it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 48.70it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.78it/s]


Error processing resume 1: 'NoneType' object has no attribute 'strip'
Processing resume 2


Batches: 100%|██████████| 1/1 [00:00<00:00, 25.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 59.36it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.53it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 20.46it/s]


Successfully processed resume 2
Processing resume 3


Batches: 100%|██████████| 1/1 [00:00<00:00, 95.26it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 91.97it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 58.85it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.36it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 21.97it/s]


Successfully processed resume 3
Processing resume 4


Batches: 100%|██████████| 1/1 [00:00<00:00, 76.81it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 33.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 29.67it/s]


Successfully processed resume 4
Processing resume 5


Batches: 100%|██████████| 1/1 [00:00<00:00, 106.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 41.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 74.77it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 36.25it/s]


Successfully processed resume 5
Processing resume 6


Batches: 100%|██████████| 1/1 [00:00<00:00, 35.98it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 58.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 83.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.55it/s]


Successfully processed resume 6
Processing resume 7


Batches: 100%|██████████| 1/1 [00:00<00:00, 66.65it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.92it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 64.47it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.73it/s]


Successfully processed resume 7
Processing resume 8


Batches: 100%|██████████| 1/1 [00:00<00:00, 31.14it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 77.18it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.00it/s]


Successfully processed resume 8
Processing resume 9


Batches: 100%|██████████| 1/1 [00:00<00:00, 107.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 86.56it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 60.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 36.98it/s]


Successfully processed resume 9
Processing resume 10
Error parsing resume: Invalid json output: {"name": "saeshta", "contact_details": {"email": "saeshta@gmail.com", "phone": "+91-752207702", "address": "75, Karpe Zila, Maheshtala-615167", "linkedin": "https://www.linkedin.com/in/saeshta"}, "education": [{"degree": "Advance Diploma in Hotel Management", "institution": "J&W Institute of Hotel Management", "dates": "January 2010"}], "experience": [{"title": "Sales, Fitness, Operations", "company": "Sculpt Fitness Centre", "start_date": "2015", "end_date": "Present", "duration": "7 years, 0 months"}, {"title": "Fitness, Sales, Administration, Facility & House-keeping", "company": "AB's Fitness Club", "start_date": "", "end_date": "", "duration": ""}, {"title": "Sales, Fitness, Operations", "company": "AB's Fitness Club", "start_date": "", "end_date": "", "duration": ""}, {"title": "", "company": "Ultimate Fitness Club", "start_date": "", "end_date": "", "duration": ""}, {"title": "Sales, 

Batches: 100%|██████████| 1/1 [00:00<00:00, 103.82it/s]


Successfully processed resume 10
Processing resume 11


Batches: 100%|██████████| 1/1 [00:00<00:00, 79.51it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 34.26it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 17.70it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 20.52it/s]


Successfully processed resume 11
Processing resume 12


Batches: 100%|██████████| 1/1 [00:00<00:00, 102.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 32.50it/s]


Successfully processed resume 12
Processing resume 13


Batches: 100%|██████████| 1/1 [00:00<00:00, 61.04it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 36.64it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 21.98it/s]


Successfully processed resume 13
Processing resume 14


Batches: 100%|██████████| 1/1 [00:00<00:00, 72.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.40it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 28.05it/s]


Successfully processed resume 14
Processing resume 15


Batches: 100%|██████████| 1/1 [00:00<00:00, 58.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 78.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 39.87it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.15it/s]


Successfully processed resume 15
Processing resume 16


Batches: 100%|██████████| 1/1 [00:00<00:00, 48.45it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.48it/s]


Successfully processed resume 16
Processing resume 17


Batches: 100%|██████████| 1/1 [00:00<00:00, 14.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 71.96it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 76.74it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 13.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 14.89it/s]


Successfully processed resume 17
Processing resume 18


Batches: 100%|██████████| 1/1 [00:00<00:00, 45.81it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 35.86it/s]


Successfully processed resume 18
Processing resume 19


Batches: 100%|██████████| 1/1 [00:00<00:00, 21.79it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 57.57it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 63.83it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.39it/s]


Successfully processed resume 19
Processing resume 20


Batches: 100%|██████████| 1/1 [00:00<00:00, 24.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 72.63it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 58.57it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 15.53it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 15.04it/s]


Successfully processed resume 20
Processing resume 21


Batches: 100%|██████████| 1/1 [00:00<00:00, 22.51it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 47.52it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 28.48it/s]


Successfully processed resume 21
Processing resume 22


Batches: 100%|██████████| 1/1 [00:00<00:00, 54.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 32.91it/s]


Successfully processed resume 22
Processing resume 23


Batches: 100%|██████████| 1/1 [00:00<00:00, 110.84it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 87.02it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 79.80it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 51.45it/s]


Successfully processed resume 23
Processing resume 24


Batches: 100%|██████████| 1/1 [00:00<00:00, 46.90it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 83.74it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 36.36it/s]


Successfully processed resume 24
Processing resume 25


Batches: 100%|██████████| 1/1 [00:00<00:00, 59.32it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 90.62it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 82.75it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 32.82it/s]


Successfully processed resume 25
Processing resume 26


Batches: 100%|██████████| 1/1 [00:00<00:00, 45.97it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 59.74it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 66.97it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.84it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.24it/s]


Successfully processed resume 26
Processing resume 27


Batches: 100%|██████████| 1/1 [00:00<00:00, 53.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 49.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 99.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 89.96it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 23.78it/s]


Successfully processed resume 27
Processing resume 28
Error parsing resume: Invalid json output: {
  "name": "bhajan singh rodhan",
  "contact_details": {
    "email": "bhajan.singh.rodhan@gmail.com",
    "phone": "+91-745936043",
    "address": "68/739, Gokhale Street, Aurangabad 459400",
    "linkedin": "https://www.linkedin.com/in/bhajan-singh-rodhan"
  },
  "education": [
    {
      "degree": "MBA Marketing & Sales",
      "institution": "Amity University",
      "dates": "January 2017"
    },
    {
      "degree": "Bsc. Hotel Management",
      "institution": "P.S.G College of arts and sciences",
      "dates": "January 2015"
    }
  ],
  "experience": [
    {
      "title": "Sales Manager",
      "company": "Cohesive Technologies",
      "start_date": "",
      "end_date": "",
      "duration": "",
      
    }
  ],
  "projects": [],
  "total_experience": "",
  "skills": [
    "Multi-tasking",
    "Collaborative",
    "Optimistic Thinking",
    "Effective teamleader/team trainer

Batches: 100%|██████████| 1/1 [00:00<00:00, 62.50it/s]


Successfully processed resume 28
Processing resume 29


Batches: 100%|██████████| 1/1 [00:00<00:00, 27.67it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 69.71it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 60.14it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.31it/s]


Successfully processed resume 29
Processing resume 30


Batches: 100%|██████████| 1/1 [00:00<00:00, 52.84it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 50.89it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.76it/s]


Successfully processed resume 30


In [None]:

# Main execution with error handling
try:
    df = process_resume_csv(r"C:\Users\pveer\OneDrive\Desktop\Uphire\csvdatasets\resumesfinal1.csv")
    for index, row in df.iterrows():
        try:
            print(f"Processing resume {index + 1}")
            cleaned_text = extract_clean_text_from_raw(str(row['content']))
            import time
            # Optional: Add delay to avoid overwhelming server
            time.sleep(10)
            print(f"Successfully processed resume {index + 1}")
            # Optional: Add delay to avoid overwhelming server
            # time.sleep(1)
        except Exception as e:
            print(f"Error processing resume {index + 1}: {str(e)}")
            continue
except Exception as e:
    print(f"Fatal error: {str(e)}")

Processing resume 1
Error processing resume 1: The file Name: sunita
Phone: +91-915984349
Address: 599, Koshy Path, Haldia 410811
LinkedIn: https://www.linkedin.com/in/sunita
Gmail: sunita@gmail.com
Other Number: +91-753585952

TECHNICAL SKILLS Programming Languages: Java (Servlet, JSP, Spring Boot). Web Technology: HTML5, CSS3, Bootstrap, JavaScript, JQuery, Ajax, AngularJs. Database: MySQL. IDE and Tool: Eclipse, spring tool Suit, Net beans, Sublime Text, Atom. Operating System: Windows XP, 7, 8, 10. ACHIEVEMENT â¢ Java Developer Certificate from Unanth Technical Institute. â¢ Java Certificate from solo Learn. â¢ Command line crash Course certificate from Udemy. JOB DETAILS Education Details 
January 2018 M.C.A  Pune, Maharashtra Pune University
January 2015 B.C.A  Amravati, Maharashtra Amravati University
January 2012 H.S.C  Amravati, Maharashtra Amravati University
January 2010 S.S.C  Amravati, Maharashtra Amravati University
Java developer 

Full Stack Java Developer
Skill Deta

In [None]:
df = process_resume_csv(r"C:\Users\pveer\OneDrive\Desktop\Uphire\csvdatasets\resumesfinal1.csv")
s=df.iloc[0].content
print(extract_and_clean_text(s))

In [None]:
from fastapi import APIRouter, Query, Body, Depends, HTTPException
from database.client import get_collection
from core.vectorizer import Vectorizer
from core.helpers import format_resume
from typing import List, Dict
import pymongo
import re
import re
collection = get_collection()
vectorizer = Vectorizer()
def extract_skills(raw_data):
    """
    Extracts individual skills from raw skill text while handling various formats.

    Args:
        raw_data (str): Raw skills string that may contain categories and groupings

    Returns:
        List[str]: Clean list of individual skills
    """
    # Remove category labels (anything before ":")
    cleaned = re.sub(r".*?:", "", raw_data)

    # Handle parenthetical groups
    # Extract content within parentheses and add to main list
    parenthetical_content = re.findall(r"\((.*?)\)", cleaned)
    cleaned = re.sub(r"\(.*?\)", ",", cleaned)

    # Split by common separators
    skills = re.split(r"[,/&]|\band\b", cleaned)

    # Add back parenthetical content
    for content in parenthetical_content:
        skills.extend(re.split(r"[,\s]", content))

    # Clean up each skill
    processed_skills = []
    for skill in skills:
        # Remove special characters and extra whitespace
        skill = re.sub(r"[^\w\s-]", "", skill)
        skill = skill.strip().lower()

        # Skip empty strings and common words to filter
        if skill and skill not in ["others", "and", "in", "of"]:
            processed_skills.append(skill)
     
    return processed_skills



def autocomplete_skills(
    prefix: str ='py',
    limit: int = 10,
):
    try:
        # Fetch raw skills data from the database
        pipeline = [
            {"$unwind": "$skills"},
            {"$match": {"skills": {"$regex": f".*{prefix}.*", "$options": "i"}}},
            {"$group": {"_id": "$skills"}},
            {"$limit": limit * 5},  # Increase limit to account for splitting
            {"$project": {"raw_skill": "$_id", "_id": 0}},
        ]

        raw_results = list(collection.aggregate(pipeline))
        raw_skills = [result["raw_skill"] for result in raw_results]

        # For debugging
        print(f"Raw skills fetched: {raw_skills}")

        # Extract and process individual skills
        processed_skills = []
        for raw_skill in raw_skills:
            extracted = extract_skills(raw_skill)
            processed_skills.extend(extracted)
            # For debugging
            print(f"Extracted skills from '{raw_skill}': {extracted}")

        # Deduplicate and filter by prefix
        filtered_skills = [
            skill for skill in set(processed_skills) if prefix.lower() in skill.lower()
        ]

        # For debugging
        print(f"Filtered skills: {filtered_skills}")

        # Limit the results
        return filtered_skills[:limit]

    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Skills autocomplete failed: {str(e)}"
        )
autocomplete_skills()

Raw skills fetched: ['python', '•Software languages: Python', 'Python', 'Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery', 'Others: Regular Expression, HTML, CSS, Angular 6, Logstash, Kafka, Python Flask, Git, Docker, computer vision - Open CV and understanding of Deep learning']
['python']
Extracted skills from 'python': ['python']
['python']
Extracted skills from '•Software languages: Python': ['python']
['python']
Extracted skills from 'Python': ['python']
['python', 'sql', 'java', 'javascript', 'jquery', 'pandas', 'numpy', 'scipy', 'scikit-learn', 'matplotlib']
Extracted skills from 'Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery': ['python', 'sql', 'java', 'javascript', 'jquery', 'pandas', 'numpy', 'scipy', 'scikit-learn', 'matplotlib']
['regular expression', 'html', 'css', 'angular 6', 'logstash', 'kafka', 'python flask', 'git', 'docker', 'computer vision - o

['python flask', 'python', 'numpy', 'scipy']

In [13]:
import re

s = [
    'python',
    '•Software languages: Python',
    'Python',
    'Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery',
    'Others: Regular Expression, HTML, CSS, Angular 6, Logstash, Kafka, Python Flask, Git, Docker, computer vision - Open CV and understanding of Deep learning'
]

def extract_skills(raw_data):
    """
    Extracts individual skills from raw skill text while handling various formats.

    Args:
        raw_data (str): Raw skills string that may contain categories and groupings

    Returns:
        List[str]: Clean list of individual skills
    """
    # Remove category labels (anything before ":")
    cleaned = re.sub(r".*?:", "", raw_data)

    # Handle parenthetical groups
    parenthetical_content = re.findall(r"\((.*?)\)", cleaned)
    cleaned = re.sub(r"\(.*?\)", ",", cleaned)

    # Split by common separators
    skills = re.split(r"[,/&]|\band\b", cleaned)

    # Add back parenthetical content
    for content in parenthetical_content:
        skills.extend(re.split(r"[,\s]", content))

    # Clean up each skill
    processed_skills = []
    for skill in skills:
        # Remove special characters and extra whitespace
        skill = re.sub(r"[^\w\s-]", "", skill)
        skill = skill.strip().lower()

        # Skip empty strings and common words to filter
        if skill and skill not in ["others", "and", "in", "of"]:
            processed_skills.append(skill)

    return processed_skills


# Final flat list of all skills
t = []
for raw_skill in s:
    extracted = extract_skills(raw_skill)
    t.extend(extracted)  # Use extend() instead of append()

print("Final output:")
print(t)

Final output:
['python', 'python', 'python', 'python', 'sql', 'java', 'javascript', 'jquery', 'pandas', 'numpy', 'scipy', 'scikit-learn', 'matplotlib', 'regular expression', 'html', 'css', 'angular 6', 'logstash', 'kafka', 'python flask', 'git', 'docker', 'computer vision - open cv', 'understanding of deep learning']
