In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Read the labelled code-change dataset from the Excel workbook
df = pd.read_excel('The_code_change_defects.xlsx')

# Map each distinct defect description to an integer class id; the fitted
# encoder is kept so predicted ids can be decoded back to text later
label_encoder = LabelEncoder()
df['defect_description_encoded'] = label_encoder.fit_transform(df['defect_description'])

# Tokenize the code snippets and weight the tokens with TF-IDF.
# The token pattern accepts any run of one or more word characters.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf_matrix = vectorizer.fit_transform(df['code_change_snippet'])

# Densify the sparse TF-IDF weights into a frame with one column per
# vocabulary term, then append the encoded label as the final column
feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.concat(
    [
        pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names),
        df['defect_description_encoded'],
    ],
    axis=1,
)

# Preview the first ten rows (features followed by 'defect_description_encoded')
tfidf_df.head(10)


c:\Users\warun\anaconda3\envs\tensorflow\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\warun\anaconda3\envs\tensorflow\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


Unnamed: 0,0,05,1,10,1000,15,2,200,5,__init__,...,validate_email,validate_input,value,valueerror,while,with,x,y,zerodivisionerror,defect_description_encoded
0,0.0,0.0,0.0,0.0,0.0,0.0,0.228689,0.0,0.0,0.0,...,0.0,0.228689,0.0,0.121808,0.0,0.0,0.0,0.0,0.0,30
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.186957,0.0,0.0,0.182785,0.0,0.0,0.0,0.0,0.0,21
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.193114,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24
4,0.156752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.194058,0.0,0.0,0.0,20
7,0.190678,0.0,0.104208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.206183,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.102027,...,0.0,0.0,0.0,0.0,0.111656,0.0,0.0,0.0,0.0,17


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Separate the encoded defect label (target) from the TF-IDF feature columns.
# NOTE(review): the TF-IDF weights in `tfidf_df` were fitted on every row
# before this split, so the held-out accuracy below may be optimistic.
y = tfidf_df['defect_description_encoded']
X = tfidf_df.drop(columns='defect_description_encoded')

# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Build and train a 100-tree random forest (fit returns the estimator itself)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict a label for every held-out row
y_pred = rf_classifier.predict(X_test)

# Report the share of held-out rows whose predicted label is correct
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.4


In [6]:
# NOTE(review): every statement in this cell is commented out, so running it
# does nothing; any output displayed beneath it presumably comes from an
# earlier run, before the code was disabled. The same transform / predict /
# decode steps appear, uncommented, in the later cell that loads the pickled
# classifier, vectorizer and label encoder.

# # New code snippet for prediction
# new_code_snippet = """your new code snippet here"""

# # Vectorize the new code snippet
# new_snippet_vectorized = vectorizer.transform([new_code_snippet])

# # Make a prediction
# new_prediction_encoded = rf_classifier.predict(new_snippet_vectorized)

# # Decode the prediction back to the original class label
# new_prediction_label = label_encoder.inverse_transform(new_prediction_encoded)

# print(f"The predicted defect description is: {new_prediction_label[0]}")


The predicted defect description is: ImportError




In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Re-read the raw dataset so this cell can rebuild the vectorizer on its own
df = pd.read_excel('The_code_change_defects.xlsx')

# Fit a TF-IDF vectorizer on the code snippets; the token pattern accepts any
# run of one or more word characters
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf_matrix = vectorizer.fit_transform(df['code_change_snippet'])

# Persist the fitted vectorizer so new snippets can be transformed later
# without refitting
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)


In [9]:
import pickle

# Persist the trained random forest to disk.
# Relies on `rf_classifier` already existing in the kernel, i.e. the training
# cell must have been run first.
with open('rf_classifier.pkl', 'wb') as model_file:
    pickle.dump(rf_classifier, model_file)

In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pickle

# Re-read the raw dataset so this cell can rebuild the encoder on its own
df = pd.read_excel('The_code_change_defects.xlsx')

# Learn the defect-description -> integer mapping in a single step
# (fit returns the encoder itself)
label_encoder = LabelEncoder().fit(df['defect_description'])

# Persist the fitted encoder so predicted class ids can be decoded later
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)


In [18]:
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the saved model and other objects from pickle files.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# load artifacts that were written by this notebook.
with open('rf_classifier.pkl', 'rb') as file:
    rf_classifier = pickle.load(file)

with open('vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

with open('label_encoder.pkl', 'rb') as file:
    label_encoder = pickle.load(file)

# New code snippet for prediction
new_code_snippet = """
def calculate_average(numbers):
    total = 0
    count = 0
    for num in numbers:
        total += num
        count += 1
    return total / count
"""

# Transform the new code snippet using the loaded TF-IDF vectorizer
new_snippet_vectorized = vectorizer.transform([new_code_snippet])

# The classifier was fitted on a DataFrame whose columns are the TF-IDF
# vocabulary terms, so scikit-learn recorded those feature names. Passing the
# bare sparse matrix triggers an "X does not have valid feature names" warning
# and skips the column-name check. Labelling the columns with the vectorizer's
# vocabulary silences the warning and makes scikit-learn raise if the loaded
# vectorizer and classifier do not belong together.
new_snippet_features = pd.DataFrame(
    new_snippet_vectorized.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Make a prediction with the loaded RandomForestClassifier
new_prediction_encoded = rf_classifier.predict(new_snippet_features)

# Decode the prediction to get the original defect description
new_prediction_label = label_encoder.inverse_transform(new_prediction_encoded)

# Output the predicted defect description
print(f"The predicted defect description is: {new_prediction_label[0]}")


The predicted defect description is: Division by zero when 'numbers' list is empty


