In [1]:
import torch

# Check for PyTorch availability
if not torch.cuda.is_available():
  print("WARNING: PyTorch is not using GPU. Consider installing GPU drivers or using CPU version.")

from transformers import BertTokenizer, BertForSequenceClassification

# Pre-trained sentiment analysis model (BERT)
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Binary sentiment

# Weighting factors (adjust these based on your priorities)
sentiment_weight = 0.6
clarity_weight = 0.3
market_fit_weight = 0.1

def get_sentiment_score(text):
  """Uses BERT to get sentiment score (positive: 1, negative: 0)"""
  encoded_text = tokenizer(text, return_tensors='pt')
  with torch.no_grad():
    outputs = model(**encoded_text)
  predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
  return predictions[:, 1].item()  # Probability of positive sentiment

def get_clarity_score(text):
  """Simple rule-based check for clarity (modify as needed)"""
  word_count = len(text.split())
  if word_count <= 10:
    return 1
  else:
    return 0.5

def get_market_fit_score(text, keywords):
  """Checks for keywords related to market fit (adjust keywords)"""
  score = 0
  for keyword in keywords:
    if keyword in text.lower():
      score += 0.2
  return min(score, 1)  # Limit to max score of 1

def get_text_score(text, keywords):
  """Combines sentiment, clarity, and market fit scores"""
  sentiment = get_sentiment_score(text)
  clarity = get_clarity_score(text)
  market_fit = get_market_fit_score(text, keywords)
  return (sentiment * sentiment_weight) + (clarity * clarity_weight) + (market_fit * market_fit_weight)

# Example usage


# Example usage
keywords = [
"India's", "Leading", "Digital", "Payments", "Platform",
"Changing", "the", "Way", "India", "Pays",
"Hunger", "Doesn't", "Wait",
"Whatever", "You", "Crave", "We", "Deliver",
"Learning", "The", "Future", "Here",
"Beauty", "at", "Your", "Fingertips",
"Simplifying", "Insurance",
"Living", "Made", "Easy",
"Delivering", "Possibility",
"Making", "Business", "Easy",
"Payments", "You", "Can", "Trust",
"See", "Clearly", "Look", "Great",
"Your", "Perfect", "Trip", "Starts", "Here",
"Your", "Ride", "Your", "Way",
"The", "One-Stop", "Shop", "for", "Everything",
"Simplifying", "Car", "Buying",
"Transforming", "Lives", "Through", "Fitness",
"On", "Your", "Way", "to", "Greatness",
"Skills", "for", "the", "Future",
"Your", "Credit", "Card", "Bill", "Partner",
"Powering", "seamless", "payments",
"Your", "Style", "Your", "Way",
"Reselling", "Simplified",
"Future", "of", "Mobility",
"Same-Day", "Delivery", "Every", "City",
"Furniture", "for", "Happy", "Homes",
"Learn", "Anything", "Anytime", "Anywhere",
"Everything", "for", "Your", "Baby",
"Comprehensive", "service", "in", "Russian",
"Reimagine", "Learning","Patented"
]
text="Our compression doesn't need to upload & download any files it compresses the files right inside the user's machine. The final size reduces upto 90% at the exact same quality. Our software also supports any type of GPU of local machine. That's why our compression is way ahead of the competitors."
score = get_text_score(text, keywords)

# Scale and round the score to an integer (adjust range as needed)
final_score = int(round(score * 10))  # Scales score to 0-10 range

print(f"Text: {text}")
print(f"Final Score: {final_score}")




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Text: Our compression doesn't need to upload & download any files it compresses the files right inside the user's machine. The final size reduces upto 90% at the exact same quality. Our software also supports any type of GPU of local machine. That's why our compression is way ahead of the competitors.
Final Score: 5


In [2]:
score = get_sentiment_score("Yash is happy")
score

0.4169823229312897

In [3]:
import torch
import warnings

# Suppress potential warnings
warnings.filterwarnings("ignore", category=UserWarning)

from transformers import BertTokenizer, BertForSequenceClassification



# Pre-trained sentiment analysis model (BERT)
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Binary sentiment

# Weighting factors (adjust these based on your priorities)
sentiment_weight = 0.6
clarity_weight = 0.3
market_fit_weight = 0.1

def get_sentiment_score(text):
  """Uses BERT to get sentiment score (positive: 1, negative: 0)"""
  encoded_text = tokenizer(text, return_tensors='pt')
  with torch.no_grad():
    outputs = model(**encoded_text)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return predictions[:, 1].item()  # Probability of positive sentiment

def get_clarity_score(text):
  """Simple rule-based check for clarity (modify as needed)"""
  word_count = len(text.split())
  if word_count <= 10:
    return 1
  else:
    return 0.5

def get_text_score(text):
  """Combines sentiment, clarity, and pre-defined market fit scores"""
  sentiment = get_sentiment_score(text)
  clarity = get_clarity_score(text)
  market_fit = 0.5  # Placeholder score for missing keywords check
  return (sentiment * sentiment_weight) + (clarity * clarity_weight) + (market_fit * market_fit_weight)

def get_final_score(text):
  """Calculates and returns the final text score"""
  score = get_text_score(text)
  final_score = int(round(score * 10))  # Scales score to 0-10 range
  return final_score

# Example usage


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import pandas as pd
da_frame = pd.read_csv("your_data.csv")
# Define your get_final_score function


# Iterate over each row of the 'USP' column
for index, row in da_frame.iterrows():
    # Get the text from the 'USP' column of the current row
    text = row['USP']
    
    # Apply the get_final_score function to the text
    score = get_final_score(text)
    
    # Replace the original value in the DataFrame with the score
    da_frame.at[index, 'USP'] = score

# Save the modified DataFrame back to a CSV file
output_file_path = 'C:\\Users\\aryan\\Desktop\\ShareEquity\\modified.csv'
da_frame.to_csv(output_file_path, index=False)


In [5]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Load data (replace with your actual file path)
data = pd.read_csv("modified.csv")

# Feature columns (replace with actual column names)
features = ["Qualifications", "Alumni", "Team_Size", "Sentiment", "Funding_Stage", "Funding_Amount",
            "Stage", "Market_Size", "Team_Experience", "USP"]

# Target column (integer value to predict)
target = "Investment_Score"

# Convert Market_Size to numeric by removing commas
data["Market_Size"] = data["Market_Size"].str.replace(",", "").astype(float)

X = data[features]
y = data[target]

# Handle missing values (optional)
# Uncomment if needed and choose an appropriate imputation strategy
# imputer = SimpleImputer(strategy="mean")  # You can choose other strategies like "median", "most_frequent"
# X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Preprocess categorical features
categorical_features = ["Qualifications", "Alumni", "Sentiment", "Funding_Stage", "Stage"]
encoder = OneHotEncoder(handle_unknown='ignore')  # One-Hot Encoding
encoded_X = pd.DataFrame(encoder.fit_transform(X[categorical_features]).toarray(), columns=encoder.get_feature_names_out(categorical_features))

# Combine processed features
numerical_features = [feature for feature in features if feature not in categorical_features]
X_processed = pd.concat([X[numerical_features].reset_index(drop=True), encoded_X.reset_index(drop=True)], axis=1)

# Ensure all feature names are strings
X_processed.columns = X_processed.columns.astype(str)
X_processed['Funding_Amount'] = X_processed['Funding_Amount'].str.replace(',', '').astype(int)
X_processed['Funding_Amount'] = X_processed['Funding_Amount'].astype(int)


# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Create and train the decision tree
model = DecisionTreeRegressor(max_depth=5)  # Adjust max_depth as needed

model.fit(X_train, y_train)

# Make predictions on new data
collabkart_data = {
  "Qualifications": "B.Tech",  # Assuming all founders have B.Tech from IITM based on the information provided
  "Alumni": "IIT",
  "Team_Size": 3,
  "Sentiment": "Positive",  # Assuming positive based on the information provided
  "Funding_Stage": "Pre-Seed",
  "Funding_Amount": 1400000,  # No funding mentioned
  "Stage": "MVP",
  "Market_Size": "20000000",  # Assuming target market is Indian influencers based on the information provided
  "Team_Experience": 4,  # Can't determine experience from info provided
  "USP": 7  # Matching algorithms for ideal influencer selection
}


new_data_df = pd.DataFrame([collabkart_data])

# Convert Market_Size in new_data to numeric by removing commas
new_data_df["Market_Size"] = new_data_df["Market_Size"].str.replace(",", "").astype(float)

# Preprocess new data similarly (categorical encoding)
new_data_encoded = pd.DataFrame(encoder.transform(new_data_df[categorical_features]).toarray(), columns=encoder.get_feature_names_out(categorical_features))
new_data_processed = pd.concat([new_data_df[numerical_features].reset_index(drop=True), new_data_encoded.reset_index(drop=True)], axis=1)

# Ensure all feature names are strings
new_data_processed.columns = new_data_processed.columns.astype(str)

# Predict investment score for the new data
predicted_score = model.predict(new_data_processed)[0]
print("Predicted Investment Score:", predicted_score)


Predicted Investment Score: 90.0


In [6]:
data

Unnamed: 0,Qualifications,Alumni,Team_Size,Sentiment,Revenue_Projection,Funding_Stage,Funding_Amount,Stage,Market_Size,Team_Experience,USP,Investment_Score
0,M.Tech,Other,4,Positive,Inter,Series A,100001,Product Market Fit,1000000.0,4,6,85
1,B.Tech,IIM,3,Negative,Inter,Pre-Seed,200000,Ideation,50000.0,2,7,40
2,PhD,Other,2,Positive,Inter,Seed,500000,MVP,200000.0,5,7,90
3,MBA,BITS,5,Positive,Inter,Series A,2000000,Go To Market,800000.0,3,7,75
4,Others,IIT,1,Negative,Inter,Pre-Seed,250000,Ideation,10000.0,1,7,35


In [7]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Load data (replace with your actual file path)
data = pd.read_csv("modified.csv")

# Feature columns (replace with actual column names)
features = ["Qualifications", "Alumni", "Team_Size", "Sentiment", "Funding_Stage", "Funding_Amount",
            "Stage", "Market_Size", "Team_Experience", "USP"]

# Target column (integer value to predict)
target = "Investment_Score"

# Convert Market_Size to numeric by removing commas
data["Market_Size"] = data["Market_Size"].str.replace(",", "").astype(float)

X = data[features]
y = data[target]

# Handle missing values (optional)
# Uncomment if needed and choose an appropriate imputation strategy
# imputer = SimpleImputer(strategy="mean")  # You can choose other strategies like "median", "most_frequent"
# X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Preprocess categorical features
categorical_features = ["Qualifications", "Alumni", "Sentiment", "Funding_Stage", "Stage"]
encoder = OneHotEncoder(handle_unknown='ignore')  # One-Hot Encoding
encoded_X = pd.DataFrame(encoder.fit_transform(X[categorical_features]).toarray(), columns=encoder.get_feature_names_out(categorical_features))

# Combine processed features
numerical_features = [feature for feature in features if feature not in categorical_features]
X_processed = pd.concat([X[numerical_features].reset_index(drop=True), encoded_X.reset_index(drop=True)], axis=1)

# Ensure all feature names are strings
X_processed.columns = X_processed.columns.astype(str)
X_processed['Funding_Amount'] = X_processed['Funding_Amount'].str.replace(',', '').astype(int)
X_processed['Funding_Amount'] = X_processed['Funding_Amount'].astype(int)


# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Create and train the decision tree
model = DecisionTreeRegressor(max_depth=5)  # Adjust max_depth as needed

model.fit(X_train, y_train)

# Get feature importances
feature_importances = model.feature_importances_

# Print feature names and importances
for feature, importance in zip(X_processed.columns, feature_importances):
  print(f"{feature}: {importance:.4f}")

# Make predictions on new data (your code remains unchanged)
# ... your new data processing and prediction code here ...


Team_Size: 0.0000
Funding_Amount: 0.0000
Market_Size: 0.0000
Team_Experience: 0.0000
USP: 0.0000
Qualifications_B.Tech: 0.0000
Qualifications_M.Tech: 0.0000
Qualifications_MBA: 0.0000
Qualifications_Others: 0.0000
Qualifications_PhD: 0.0000
Alumni_BITS: 0.0000
Alumni_IIM: 0.0000
Alumni_IIT: 0.0000
Alumni_Other: 0.0557
Sentiment_Negative: 0.0000
Sentiment_Positive: 0.0000
Funding_Stage_Pre-Seed: 0.9376
Funding_Stage_Seed: 0.0000
Funding_Stage_Series A: 0.0000
Stage_Go To Market: 0.0000
Stage_Ideation: 0.0000
Stage_MVP: 0.0000
Stage_Product Market Fit: 0.0067
