# Software Effort Estimation Using LLMs

This research evaluates the performance of GPT-4 and Llama 3 on software effort estimation, both in a zero-shot setting and in a trained setting.

# Data Exploration

In [1]:
# install libraries
!pip3 install pandas
!pip3 install numpy
!pip3 install matplotlib
!pip3 install wordcloud
!pip3 install seaborn
!pip3 install nltk
!pip3 install mysql-connector-python
!pip3 install transformers
!pip3 install torch


Collecting wordcloud
  Downloading wordcloud-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading wordcloud-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.1/511.1 kB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wordcloud
Successfully installed wordcloud-1.9.3
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x

In [2]:
# record the installed package versions in requirements.txt
!pip3 freeze > requirements.txt

In [3]:
# import libraries
import os
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import mysql.connector
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import transformers
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

In [4]:
# Connect to MySQL
# Connection settings are read from the environment so that no secret is stored
# in the notebook. MYSQL_PASSWORD must be set before running this cell (a
# KeyError is raised otherwise); the other settings fall back to local defaults.
conn = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ["MYSQL_PASSWORD"],
    database=os.environ.get("MYSQL_DATABASE", "local"),
)

DatabaseError: 2003 (HY000): Can't connect to MySQL server on 'localhost:3306' (111)

In [None]:
# Load every row and column of the Issue table into a DataFrame
query = "SELECT * FROM Issue"
df = pd.read_sql(sql=query, con=conn)

In [None]:
# Close connection: the query result is already held in df
conn.close()

In [None]:
# check shape: (rows, columns) of the Issue table as loaded
df.shape

There are about 450,000 rows and 30 columns in the DataFrame.

In [None]:
# display first few rows of the dataframe
# (a bare last expression is rendered as a rich table; print() would fall back to plain text)
df.head()

In [None]:
# describe the df: summary statistics produced by pandas' default describe()
df.describe()

In [None]:
# check column names to see which fields the table provides
df.columns

In [None]:
# check types: number of rows for each distinct value of the Type column
df['Type'].value_counts()

In [None]:
# keep only the rows whose Type is exactly 'Story'
df = df.loc[df['Type'].eq('Story')]

In [None]:
# check shape after keeping only the Story rows
df.shape

In [None]:
# reduce the frame to the text column and the resolution-time column
df = df.loc[:, ['Description', 'Resolution_Time_Minutes']]

In [None]:
# check shape after reducing the frame to the two selected columns
df.shape

In [None]:
# check few rows: 10 randomly sampled rows (no seed is set, so they differ between runs)
df.sample(10)

In [None]:
# remove rows with empty descriptions
# NULL and whitespace-only descriptions carry no text either, and a NULL value
# would break the string operations applied to this column further down
df = df[df['Description'].notna() & (df['Description'].str.strip() != '')]

In [None]:
# check shape after removing rows with empty descriptions
df.shape

In [None]:
# remove rows with 0 resolution minutes
# `> 0` rather than `!= 0`: NaN != 0 evaluates to True and would keep rows with a
# missing resolution time; negative durations are dropped as invalid as well
df = df[df['Resolution_Time_Minutes'] > 0]

In [None]:
# check shape after filtering on Resolution_Time_Minutes
df.shape

In [None]:
# check for null values: number of missing entries in Description
df['Description'].isnull().sum()

In [None]:
# check for null values: number of missing entries in Resolution_Time_Minutes
df['Resolution_Time_Minutes'].isnull().sum()

In [None]:
# add column for checking length of the descriptions (number of whitespace-separated words)
# .assign returns a new frame, which avoids the SettingWithCopyWarning raised when a
# column is set on a filtered frame; the vectorised .str methods give NaN for a
# missing description instead of raising
df = df.assign(Description_Length=df['Description'].str.split().str.len())

In [None]:
# describe the length column: distribution of the per-description word counts
df['Description_Length'].describe()

The average description length is around 57 words (the length column counts whitespace-separated words, not characters).

In [None]:
# set up NLTK: download the 'punkt' and 'stopwords' resources, then load the
# English stop-word list as a set
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [None]:
# define a function to normalize text

# Suffix contractions and the text that replaces them.
_CONTRACTIONS = {
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'ll": " will",
    "'ve": " have",
    "'m": " am"
}
# Irregular forms whose stem changes; the suffix rule alone would leave "wo" / "ca".
_IRREGULAR_CONTRACTIONS = {
    "won't": "will not",
    "can't": "can not",
}
# A suffix only counts as a contraction when it follows a letter and ends the word,
# so quoted text such as 'data' is not mistaken for the contraction "'d".
_CONTRACTION_RE = re.compile(r"\b(won't|can't)\b|(?<=[a-z])(n't|'re|'s|'d|'ll|'ve|'m)\b")
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
# Filled on first use so the stop-word corpus is read once, not once per row.
_NLTK_CACHE = {}


def _expand_contraction(match):
    """Return the expansion for one contraction matched by _CONTRACTION_RE."""
    irregular, suffix = match.groups()
    if irregular is not None:
        return _IRREGULAR_CONTRACTIONS[irregular]
    return _CONTRACTIONS[suffix]


def _nltk_resources():
    """Return the cached (English stop-word set, Porter stemmer) pair."""
    if not _NLTK_CACHE:
        _NLTK_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _NLTK_CACHE['stemmer'] = PorterStemmer()
    return _NLTK_CACHE['stop_words'], _NLTK_CACHE['stemmer']


def normalize_text(sentence):
    """Normalize a free-text description for word-frequency analysis.

    Steps, in order: lowercase, expand contractions, remove punctuation,
    remove digits, collapse whitespace, tokenize, drop English stop words,
    and stem each remaining word with the Porter stemmer.

    Parameters
    ----------
    sentence : str
        Raw text. Any non-string value (e.g. None or NaN) is treated as
        having no text.

    Returns
    -------
    str
        The normalized words joined by single spaces; '' when nothing is left.
    """
    # Missing values have no text to normalize
    if not isinstance(sentence, str):
        return ''

    # Convert to lowercase and unify typographic apostrophes with the ASCII one
    sentence = sentence.lower().replace("\u2019", "'")

    # Expand contractions. This has to happen before punctuation is removed:
    # once the apostrophes are gone there is nothing left to match.
    sentence = _CONTRACTION_RE.sub(_expand_contraction, sentence)

    # Remove punctuation
    sentence = sentence.translate(_PUNCTUATION_TABLE)

    # Remove numbers
    sentence = re.sub(r'\d+', '', sentence)

    # Remove extra whitespace
    sentence = ' '.join(sentence.split())
    if not sentence:
        return ''

    # Tokenize, remove stop words, then stem what remains
    stop_words, stemmer = _nltk_resources()
    words = [stemmer.stem(word) for word in word_tokenize(sentence) if word not in stop_words]

    # Join the words back into a sentence
    return ' '.join(words)

In [None]:
# apply normalize_text function to descriptions and save to new column
# .assign returns a new frame, which avoids the SettingWithCopyWarning raised when a
# column is set on a filtered frame
df = df.assign(Cleaned_Description=df['Description'].map(normalize_text))

In [None]:
# check few rows: 10 randomly sampled cleaned descriptions
df['Cleaned_Description'].sample(10)

In [None]:
# Count how often each word occurs across all cleaned descriptions
# and keep the ten most frequent ones
all_words = [word for text in df['Cleaned_Description'] for word in text.split()]
word_freq = Counter(all_words)
common_words = word_freq.most_common(10)
common_words

In [None]:
# show word cloud
# random_state fixes the otherwise random layout so that re-running the
# notebook reproduces the same figure
wordcloud = WordCloud(
    width=800, height=400, background_color='white', random_state=42
).generate(' '.join(all_words))

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Descriptions')
plt.show()

In [None]:
# Bar chart of the most common words and their frequencies
common_words_df = pd.DataFrame(data=common_words, columns=['Word', 'Frequency'])

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x='Word', y='Frequency', data=common_words_df, ax=ax)
ax.set_title('Most Common Words in Descriptions')
plt.show()

# LLAMA - 3

In [None]:
# define llama3 model: identifier of the Llama 3 8B Instruct checkpoint
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:
# Build the text-generation pipeline.
# model_id is used instead of repeating the literal so the two cannot drift apart,
# and the device falls back to CPU when no CUDA device is available.
pipeline = transformers.pipeline(
  "text-generation",
  model=model_id,
  model_kwargs={"torch_dtype": torch.bfloat16},
  device="cuda" if torch.cuda.is_available() else "cpu",
)