## Data Exploration
This notebook will allow us to explore the data found in the Kaggle dataset

In [31]:
# imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import html
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk


In [32]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/vivianzhu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vivianzhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vivianzhu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
df = pd.read_csv('dataset/cleaned.csv')

In [34]:
df.head()

Unnamed: 0,Id,Tag,Score_question,Title,Body_question,Score_answer,Body_answer
0,469,"['python', 'osx', 'fonts', 'photoshop']",21,find full path font display name mac,using photoshop javascript api find font given...,4,open terminal terminal type locate insertfonth...
1,469,"['python', 'osx', 'fonts', 'photoshop']",21,find full path font display name mac,using photoshop javascript api find font given...,2,not able find anything directly think iterate ...
2,469,"['python', 'osx', 'fonts', 'photoshop']",21,find full path font display name mac,using photoshop javascript api find font given...,12,unfortunately api not deprecated located appli...
3,469,"['python', 'osx', 'fonts', 'photoshop']",21,find full path font display name mac,using photoshop javascript api find font given...,1,must method cocoa get list font would use pyob...
4,502,"['python', 'windows', 'image', 'pdf']",27,get preview jpeg pdf window,python application need generate jpeg preview ...,9,use imagemagick convert utility see example ht...


In [35]:
df.columns

Index(['Id', 'Tag', 'Score_question', 'Title', 'Body_question', 'Score_answer',
       'Body_answer'],
      dtype='object')

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 987122 entries, 0 to 987121
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Id              987122 non-null  int64 
 1   Tag             987122 non-null  object
 2   Score_question  987122 non-null  int64 
 3   Title           986547 non-null  object
 4   Body_question   987101 non-null  object
 5   Score_answer    987122 non-null  int64 
 6   Body_answer     986275 non-null  object
dtypes: int64(3), object(4)
memory usage: 52.7+ MB


In [37]:
df.describe()

Unnamed: 0,Id,Score_question,Score_answer
count,987122.0,987122.0,987122.0
mean,20534960.0,7.275413,3.028437
std,11954860.0,63.667863,21.263246
min,469.0,-44.0,-38.0
25%,9968532.0,0.0,0.0
50%,21035500.0,1.0,1.0
75%,31152330.0,3.0,3.0
max,40143190.0,5524.0,8384.0


In [None]:
#remove empty rows
df = df[df['Body_question'].str.strip() != ""]
df = df[df['Body_answer'].str.strip() != ""]
#remove duplicates from body_question
df = df.drop_duplicates(subset=['Body_question'], keep='first')
#filter if question less than 2 words long
df = df[df['Body_question'].str.split().str.len() > 2]

In [None]:
vectorizer = TfidfVectorizer(max_features = 5000, ngram_range = (1,2))
tfidf_matrix = vectorizer.fit_transform(df['Body_question'])
responses = df['Body_answer'].tolist()


In [None]:
contraction_map = {
   "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "isn't": "is not",
    "mightn't": "might not",
    "mustn't": "must not",
    "needn't": "need not",
    "shan't": "shall not",
    "shouldn't": "should not",
    "wasn't": "was not",
    "weren't": "were not",
    "won't": "will not",
    "wouldn't": "would not",
    
    # Pronoun contractions
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    
    # Misc contractions
    "let's": "let us",
    "who's": "who is",
    "what's": "what is",
    "here's": "here is",
    "there's": "there is",
    "when's": "when is",
    "where's": "where is",
    "why's": "why is",
    "how's": "how is",
    "y'all": "you all",
    "o'clock": "of the clock",
    
    # Informal / common text contractions
    "ma'am": "madam",
    "gonna": "going to",
    "wanna": "want to",
    "gotta": "got to",
    "lemme": "let me",
    "gimme": "give me",
    "kinda": "kind of",
    "ain’t": "am not",
    "y’all": "you all",
    "could’ve": "could have",
    "should’ve": "should have",
    "would’ve": "would have",
    "might’ve": "might have",
    "must’ve": "must have",
    "shan’t": "shall not",
    "let’s": "let us"
}
def expand_contractions(text):
    for contraction, expanded in contraction_map.items():
        text = text.replace(contraction, expanded)
    return text

stop_words = set(stopwords.words('english')) - {"not", "no", "never"}
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Expand contractions
    text = expand_contractions(text)

    # Remove HTML tags if any
    text = html.unescape(text)
    text = re.sub(r'<[^>]+>', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Keep only alphabetic tokens
    tokens = [w for w in tokens if w.isalpha()]

    # Lowercase and remove stopwords
    tokens = [w.lower() for w in tokens if w.lower() not in stop_words]

    # Lemmatize
    tokens = [lemmatizer.lemmatize(w) for w in tokens]

    return " ".join(tokens)


In [None]:
def get_answer(input):
    processed = preprocess_text(input)
    input_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(input_vec, tfidf_matrix).flatten()
    index = similarity.argmax()
    score = similarity[index]

    if score < 0.2:
        return "I am unsure of the answer."
    return responses[index]


In [None]:
print(get_answer("how do I fix a python error?"))

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/vivianzhu/nltk_data'
    - '/opt/anaconda3/nltk_data'
    - '/opt/anaconda3/share/nltk_data'
    - '/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
