<a href="https://colab.research.google.com/github/zafor158/cse412/blob/main/Code_Clone_Seeker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 1: Install Required Libraries**

In [None]:
!pip install pandas



**Step 2: Import Libraries**

In [None]:
import pandas as pd
import json

**Step 3: Read JSONL File into a DataFrame**

In [None]:
# Replace 'our_dataset.jsonl' with the actual file path
file_path = '/content/data.jsonl'

# Read the JSONL file into a DataFrame
data = []
with open(file_path, 'r') as file:
    for line in file:
        json_line = json.loads(line)
        data.append(json_line)

df = pd.DataFrame(data)

**Step 4: Explore and Preprocess Data**

In [None]:
# Display the first few rows of the DataFrame. Leaving the expression as the
# last line of the cell lets the notebook render it as a table.
df.head()

                                                func       idx
0      public static void main(String[] args) {\n...  10000832
1      public synchronized String getSerialNumber...  10005623
2              public Object run() {\n           ...  10005624
3      public String post() {\n        if (conten...  10005674
4      @Override\n    public void onCreate(Bundle...  10005879


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used below. Recent NLTK releases load
# 'punkt_tab' for word_tokenize; 'punkt' is kept for older releases.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Reload the dataset so this cell always starts from the raw records, even
# when it is re-run after the 'cleaned_code' column has been added.
file_path = '/content/data.jsonl'

# UTF-8 is pinned so parsing does not depend on the platform's default locale;
# blank lines are skipped instead of making json.loads raise.
with open(file_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

df = pd.DataFrame(data)

# Shared state for preprocess_code: the English stopword set and the lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_code(code):
    """Normalize a code snippet into a space-separated string of word tokens.

    The snippet is split with NLTK's ``word_tokenize``. Tokens that are not
    purely alphabetic, or whose lowercase form is in ``stop_words``, are
    dropped; the remaining tokens are lowercased and lemmatized.

    Args:
        code: The source text to normalize.

    Returns:
        The surviving tokens joined with single spaces.
    """
    kept = []
    for raw_token in word_tokenize(code):
        # Identifiers containing digits, underscores or punctuation are discarded.
        if not raw_token.isalpha():
            continue
        lowered = raw_token.lower()
        if lowered in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(lowered))

    return ' '.join(kept)

# Add a 'cleaned_code' column holding the preprocessed text of each 'func' entry
cleaned_series = df['func'].apply(preprocess_code)
df['cleaned_code'] = cleaned_series

# Show the raw and the cleaned snippets side by side for the first few rows
preview = df[['func', 'cleaned_code']].head()
print(preview)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


                                                func  \
0      public static void main(String[] args) {\n...   
1      public synchronized String getSerialNumber...   
2              public Object run() {\n           ...   
3      public String post() {\n        if (conten...   
4      @Override\n    public void onCreate(Bundle...   

                                        cleaned_code  
0  public static void main string args int string...  
1  public synchronized string getserialnumber ser...  
2  public object run try messagedigest digest sha...  
3  public string post content null return type so...  
4  override public void oncreate bundle savedinst...  


**Step 5: Implement Machine Learning Model**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Corpus for the model: the preprocessed snippets in the 'cleaned_code' column
code_snippets = df.loc[:, 'cleaned_code']

# Fit the TF-IDF vocabulary on the corpus and vectorize it in one step
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(code_snippets)

# Pairwise similarity of every snippet with every other snippet; with the
# second argument omitted, linear_kernel compares the matrix with itself.
similarity_matrix = linear_kernel(tfidf_matrix)

**Streamlit App**

In [None]:
# app.py
%%writefile app.py
import streamlit as st
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel,cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk


# Download the NLTK resources used below. Recent NLTK releases load
# 'punkt_tab' for word_tokenize; 'punkt' is kept for older releases.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load the DataFrame
file_path = '/content/data.jsonl'  # Replace with the actual file path

# UTF-8 is pinned so parsing does not depend on the platform's default locale;
# blank lines are skipped instead of making json.loads raise.
with open(file_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

df = pd.DataFrame(data)

# Shared state for preprocess_code: the English stopword set and the lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_code(code):
    """Normalize a code snippet into a space-separated string of word tokens.

    Tokens produced by ``word_tokenize`` are kept only when they are purely
    alphabetic and their lowercase form is not in ``stop_words``; the kept
    tokens are lowercased and lemmatized.
    """
    lowered_words = (
        token.lower() for token in word_tokenize(code) if token.isalpha()
    )
    return ' '.join(
        lemmatizer.lemmatize(word) for word in lowered_words if word not in stop_words
    )

# Apply preprocessing to the 'func' field and create a new 'cleaned_code' column
df['cleaned_code'] = df['func'].apply(preprocess_code)

# Tokenize and vectorize code snippets using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_code'])

# NOTE: the corpus-vs-corpus similarity matrix is deliberately not computed
# here. Nothing in this script reads it, and linear_kernel would build a dense
# (n_snippets x n_snippets) array every time this module-level code runs.

# Streamlit App
def main():
    """Render the page and report the best match for the entered snippet.

    Shows a title, a text area and a "Detect Similarity" button. When the
    button is pressed, the entered text is normalized with ``preprocess_code``
    (the same function used to build the ``cleaned_code`` column the
    vectorizer was fitted on), vectorized once, and compared against every
    row of the precomputed ``tfidf_matrix``. The highest cosine similarity is
    displayed as a percentage.
    """
    st.title("Duplicate Code Detection App")

    # User input
    user_code = st.text_area("Enter your code snippet here:")

    if st.button("Detect Similarity"):
        # Normalize the input the same way the corpus was normalized so both
        # sides of the comparison go through identical tokenization.
        cleaned_user_code = preprocess_code(user_code)
        if not cleaned_user_code:
            # Nothing comparable was entered (empty text or no alphabetic,
            # non-stopword tokens); a similarity score would be meaningless.
            st.warning("Please enter a code snippet containing at least one word to compare.")
            return

        # Vectorize the input once and compare it with all dataset snippets in
        # a single call; tfidf_matrix already holds the vector of every
        # 'cleaned_code' entry, so the references need no re-vectorization.
        user_vector = tfidf_vectorizer.transform([cleaned_user_code])
        similarities = cosine_similarity(user_vector, tfidf_matrix).flatten()

        # TF-IDF weights are non-negative, so the cosine similarity is already
        # in [0, 1] and maps directly onto a 0-100% scale.
        max_similarity_percentage = similarities.max() * 100
        st.text(f"Maximum Similarity Percentage: {max_similarity_percentage:.2f}%")

def calculate_similarity(user_code, reference_code):
    """Return the TF-IDF cosine similarity of two snippets as a percentage.

    Both arguments are vectorized with the module-level ``tfidf_vectorizer``,
    which was fitted on the dataset's ``cleaned_code`` column.

    Args:
        user_code: Text of the snippet being checked.
        reference_code: Text of the snippet to compare against.

    Returns:
        The cosine similarity of the two vectors multiplied by 100.
    """
    # Vectorize user code
    user_vector = tfidf_vectorizer.transform([user_code])

    # Vectorize reference code
    reference_vector = tfidf_vectorizer.transform([reference_code])

    # Compute cosine similarity
    similarity = cosine_similarity(user_vector, reference_vector).flatten()[0]

    # TF-IDF weights are never negative, so the cosine similarity is already in
    # [0, 1]. Rescaling it as (similarity + 1) * 50 would assume a [-1, 1]
    # range and report 50% for snippets with nothing in common.
    similarity_percentage = similarity * 100

    return similarity_percentage

# Build the page only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


Writing app.py


In [None]:
! pip install streamlit -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Print this machine's public IPv4 address as reported by ipv4.icanhazip.com
# (-q silences wget's progress output, "-O -" writes the response to stdout).
# NOTE(review): presumably needed as the access password on the localtunnel
# landing page — confirm.
!wget -q -O - ipv4.icanhazip.com

34.106.103.88


In [None]:
# Start the Streamlit app in the background (&) and, in the same shell, open a
# localtunnel to port 8501, the port the Streamlit log reports the app on.
! streamlit run app.py & npx localtunnel --port 8501

[..................] / rollbackFailedOptional: verb npm-session dfb7704cf2e71df[0m[K
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.106.103.88:8501[0m
[0m
[K[?25hnpx: installed 22 in 6.839s
your url is: https://eager-otters-relate.loca.lt
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package