<a href="https://colab.research.google.com/github/utkarshxgupta/VENOM-v1/blob/master/attempt4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import glob
import json
import os
import string

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob

### Paths to Q1 and Q2 folders

In [28]:
# Folders that hold the JSON files for question 1 and question 2.
# Both live under /content/drive, i.e. below the Google Drive mount point.
q1_dir, q2_dir = (
    '/content/drive/MyDrive/json_data/Q1/',
    '/content/drive/MyDrive/json_data/Q2/',
)

### Mount Google Drive

In [29]:
# Mount Google Drive at /content/drive. q1_dir and q2_dir point below this
# mount point, so this cell has to run before any JSON file is read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Function to concatenate all text fields by the same speakerId

In [30]:
def get_concatenated_transcript(insights):
    """Join all transcript text in ``insights`` into a single string.

    Text is first grouped per ``speakerId`` (speakers ordered by first
    appearance, each speaker's lines in their original order) and the
    per-speaker strings are then joined with single spaces.

    Args:
        insights: dict that may contain a ``'transcript'`` list whose entries
            are dicts with ``'speakerId'`` and ``'text'`` keys.

    Returns:
        The concatenated transcript, or ``""`` when ``insights`` has no
        transcript (missing key, ``None`` or empty list).
    """
    speaker_transcripts = {}

    # A missing or null transcript is treated as "nothing was said" instead of
    # raising KeyError/TypeError.
    for entry in insights.get('transcript') or []:
        text = entry.get('text')
        if text is None:
            # Entry without any text: nothing to add for this speaker.
            continue
        speaker_transcripts.setdefault(entry.get('speakerId'), []).append(text)

    # Concatenate all text from all speakers (assuming one speaker for the applicant)
    return " ".join(" ".join(parts) for parts in speaker_transcripts.values())

### Function to calculate keyword density


In [31]:
def calculate_keyword_density(transcript, keywords):
    """Return the fraction of words in ``transcript`` that are keywords.

    Matching is case-insensitive on both sides, and punctuation attached to a
    word (``"banking,"``, ``"growth."``) is stripped before the comparison.

    Args:
        transcript: text to analyse; split on whitespace into words.
        keywords: iterable of keyword strings.

    Returns:
        ``matches / number_of_words`` as a float, or ``0`` for an empty
        transcript.
    """
    words = transcript.split()
    if not words:
        return 0

    # Set lookup instead of scanning the keyword list once per word.
    keyword_set = {keyword.lower() for keyword in keywords}
    keyword_count = sum(
        1 for word in words
        if word.strip(string.punctuation).lower() in keyword_set
    )
    return keyword_count / len(words)

### Function to extract features from a JSON file (handling concatenated transcript)


In [32]:
def process_json_file(file_path):
    """Extract transcript, sentiment, keywords and emotions from one JSON file.

    Only the insights of the first video (``data['videos'][0]['insights']``)
    are read.

    Args:
        file_path: path of the JSON file to load.

    Returns:
        dict with the keys
        ``'transcript'`` (all transcript text joined into one string),
        ``'sentiment_score'`` (TextBlob polarity of the transcript),
        ``'keywords'`` (keyword texts joined with spaces, ``""`` if none) and
        ``'emotions'`` (emotion type -> confidence of its first instance).
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    insights = data['videos'][0]['insights']

    # Get concatenated transcript for all entries with the same speakerId
    transcript = get_concatenated_transcript(insights)

    # Sentiment analysis using TextBlob
    sentiment_score = TextBlob(transcript).sentiment.polarity

    # 'keywords' is optional in the insights
    keywords = [kw['text'] for kw in insights.get('keywords', [])]

    # 'emotions' is optional too; an emotion with no instances is skipped
    # because it has no confidence value to report.
    emotions = {
        emotion['type']: emotion['instances'][0]['confidence']
        for emotion in insights.get('emotions', [])
        if emotion.get('instances')
    }

    return {
        'transcript': transcript,
        'sentiment_score': sentiment_score,
        'keywords': " ".join(keywords),
        'emotions': emotions
    }

### Define relevant keywords for Q1 and Q2

In [33]:
# Words counted as relevant when computing the keyword density of the
# question 1 answers.
q1_keywords = [
    'banking',
    'career',
    'growth',
    'icici',
    'opportunity',
]

# Words counted as relevant for the question 2 answers.
q2_keywords = [
    'achievement',
    'academic',
    'special',
    'finance',
    'personal',
]

### Data storage

In [34]:
# Accumulator: one dict of combined Q1/Q2 features per applicant.
all_features = list()

### Processing Q1 and Q2 files and extracting features

In [35]:
q1_files = sorted(glob.glob(os.path.join(q1_dir, '*.json')))
q2_files = sorted(glob.glob(os.path.join(q2_dir, '*.json')))

# The two lists are paired purely by sorted position. zip() would silently
# drop the surplus files if the folders differ in size, and every pair after a
# missing file would combine two different applicants, so fail loudly instead.
# NOTE(review): equal counts do not prove that the names line up; this still
# assumes both folders contain the same applicants -- confirm.
if len(q1_files) != len(q2_files):
    raise ValueError(
        f"Q1 has {len(q1_files)} JSON files but Q2 has {len(q2_files)}; "
        "cannot pair answers by position."
    )

# Start from an empty list so that re-running this cell does not append every
# applicant a second time.
all_features = []

for q1_file, q2_file in zip(q1_files, q2_files):
    # Get applicant ID and name from the Q1 file name: "<id>_<name>.json".
    # splitext removes only the final extension, so dots in a name survive.
    file_name = os.path.splitext(os.path.basename(q1_file))[0]
    # partition never raises: a name without "_" gives an empty applicant_name.
    applicant_id, _, applicant_name = file_name.partition('_')

    # Process both Q1 and Q2 files
    q1_features = process_json_file(q1_file)
    q2_features = process_json_file(q2_file)

    # Combine features from Q1 and Q2 into a single row for the applicant
    all_features.append({
        'applicant_id': applicant_id,
        'applicant_name': applicant_name,
        'q1_keyword_density': calculate_keyword_density(q1_features['transcript'], q1_keywords),
        'q1_sentiment_score': q1_features['sentiment_score'],
        # An emotion that was not detected counts as confidence 0.
        'q1_joy': q1_features['emotions'].get('Joy', 0),
        'q2_keyword_density': calculate_keyword_density(q2_features['transcript'], q2_keywords),
        'q2_sentiment_score': q2_features['sentiment_score'],
        'q2_joy': q2_features['emotions'].get('Joy', 0),
    })

### Create DataFrame from the extracted features


In [36]:
# One row per applicant; the columns are the keys of the feature dicts.
df = pd.DataFrame(data=all_features)

### Normalize the features before feeding into the model

In [37]:
# Numeric columns that are standardized and passed to the model.
feature_columns = [
    'q1_keyword_density', 'q1_sentiment_score', 'q1_joy',
    'q2_keyword_density', 'q2_sentiment_score', 'q2_joy',
]

scaler = StandardScaler()
X = scaler.fit_transform(df[feature_columns])

### For simplicity, create dummy labels. Replace these with your actual training labels: in a real-world scenario, you would have labels for successful vs. unsuccessful candidates.

In [38]:
# Dummy labels for testing purposes: rows at even positions get 1, odd get 0.
y = [int(row_idx % 2 == 0) for row_idx in range(len(X))]

### Train a Random Forest model on the features (you can use your real labels here)


In [39]:
# Fixed seed: a random forest is stochastic, and without it the scores (and
# therefore the ranking) change on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

### Predict probabilities of success for each candidate

In [40]:
# Probability of success (class 1). The column is looked up through
# model.classes_ rather than hard-coded as index 1: when the training labels
# contain a single class (e.g. only one applicant), predict_proba returns one
# column and [:, 1] would raise IndexError.
# NOTE: these scores are predicted on the same rows the model was fitted on.
class_labels = list(model.classes_)
if 1 in class_labels:
    df['final_score'] = model.predict_proba(X)[:, class_labels.index(1)]
else:
    # The model never saw a positive example, so it cannot assign it any probability.
    df['final_score'] = 0.0

### Sort candidates by their final score

In [41]:
# Highest score first. mergesort is a stable sort, so applicants with identical
# scores keep their current relative order and the ranking is reproducible.
df = df.sort_values(by='final_score', ascending=False, kind='mergesort')

### Output the ranked candidates to a CSV file

In [42]:
# Only the identifying columns and the score are exported; the row index is
# left out of the file.
output_columns = ['applicant_id', 'applicant_name', 'final_score']
ranked_candidates = df[output_columns]
ranked_candidates.to_csv('ranked_candidates.csv', index=False)

print("Candidate scores have been successfully generated and saved to 'ranked_candidates.csv'.")

Candidate scores have been successfully generated and saved to 'ranked_candidates.csv'.
