PoC for case insights

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import io
from faker import Faker
import os


In [None]:

def generate_random_data(n_samples=100, seed=None):
    """Build a synthetic support-case DataFrame for demonstration purposes.

    Args:
        n_samples: Number of rows (cases) to generate. One row is produced per
            calendar day starting at 2023-01-01.
        seed: Optional integer. When given, both NumPy's global generator and
            Faker's shared generator are seeded so the returned frame is
            reproducible. When ``None`` (the default) the generators are left
            untouched.

    Returns:
        A ``pandas.DataFrame`` with ``n_samples`` rows containing a date,
        categorical fields (product, environment, codes), Faker-generated
        free-text fields and an integer 'Time to resolve' in ``[1, 19]``.
    """
    if seed is not None:
        # NumPy drives the categorical/numeric columns, Faker the free text.
        np.random.seed(seed)
        Faker.seed(seed)

    fake = Faker()

    reasons_for_contact = [
        "Account related", "General Problem", "Informational", "Other",
        "Pre Sales", "Proactive", "Security Related", "Subscription / Licensing",
    ]
    cause_codes = [
        "Configuration", "Consultant File Transfer", "ESRP", "Functional Question",
        "Hardware / Hardware Error", "Installation", "Licensing", "Other Unlisted",
        "Security / Vulnerability", "Software / Software Error", "Upgrade",
    ]
    resolution_codes = [
        "Updated Configuration", "Upgraded Software / Engineering Hotfix", "Replaced Hardware",
        "Educated Customer", "Updated Account", "Customer Resolved", "Applied Workaround",
        "Updated License", "Not our issue", "Undetermined", "Unresolved", "Resolved by another group",
        "Other Unlisted", "Lack of response", "Duplicate", "Bugzilla ID existing",
        "Bugzilla ID New", "Request for Enhancement",
    ]
    product_types = [
        "BIG-IP APM", "BIG-IP ASM", "BIG-IP LTM", "BIG-IP AFM", "BIG-IP PEM",
        "BIG-IP Analytics", "BIG-IP Link Controller", "BIG-IP AAM", "BIG-IP DNS",
    ]
    product_versions = ['v13', 'v14', 'v15', 'v16', 'v17']

    # The draw order below is kept stable so a given seed always yields the
    # same frame.
    dates = pd.date_range(start='2023-01-01', periods=n_samples, freq='D')
    product_types_sample = np.random.choice(product_types, size=n_samples)
    product_versions_sample = np.random.choice(product_versions, size=n_samples)
    subjects = [fake.sentence(nb_words=6) for _ in range(n_samples)]
    customer_descriptions = [fake.paragraph(nb_sentences=3) for _ in range(n_samples)]
    engineer_descriptions = [fake.paragraph(nb_sentences=3) for _ in range(n_samples)]
    environments = np.random.choice(['Env1', 'Env2', 'Env3'], size=n_samples)
    root_causes = [fake.sentence(nb_words=10) for _ in range(n_samples)]
    recommended_actions = [fake.sentence(nb_words=5) for _ in range(n_samples)]
    additional_info = [fake.sentence(nb_words=5) for _ in range(n_samples)]
    time_to_resolve = np.random.randint(1, 20, size=n_samples)
    knowledge_base_articles = np.random.choice(['KB1', 'KB2', 'KB3'], size=n_samples)
    bugzilla_ids = np.random.choice(['BZ1', 'BZ2', 'BZ3'], size=n_samples)
    reasons = np.random.choice(reasons_for_contact, size=n_samples)
    cause_codes_sample = np.random.choice(cause_codes, size=n_samples)
    resolution_codes_sample = np.random.choice(resolution_codes, size=n_samples)

    data = pd.DataFrame({
        'Date': dates,
        'Product type': product_types_sample,
        'Product version': product_versions_sample,
        'Subject': subjects,
        'Customer description': customer_descriptions,
        'Engineer description': engineer_descriptions,
        'Environment': environments,
        'Root Cause': root_causes,
        'Recommended actions': recommended_actions,
        'Additional Information': additional_info,
        'Time to resolve': time_to_resolve,
        'Knowledge base article if used': knowledge_base_articles,
        'Bugzilla ID if found': bugzilla_ids,
        'Reason for contact': reasons,
        'Cause code': cause_codes_sample,
        'Resolution code': resolution_codes_sample,
    })
    return data

# Path to the CSV file
csv_file_path = 'values.csv'

# Prefer real data from the CSV; fall back to synthetic data when the file is
# missing or empty.
if os.path.exists(csv_file_path):
    print(f"The file '{csv_file_path}' exists.")
    if os.path.getsize(csv_file_path) > 0:
        data = pd.read_csv(csv_file_path)
        print(f"The file '{csv_file_path}' is not empty. Data loaded from CSV file.")
    else:
        data = generate_random_data()
        print(f"The file '{csv_file_path}' is empty. Generated random data.")
else:
    data = generate_random_data()
    print(f"The file '{csv_file_path}' does not exist. Generated random data.")

# Manipulate the data to show recognizable trends
# Example trend: Longer resolution times for 'Security Related'
security_mask = data['Reason for contact'] == 'Security Related'
data.loc[security_mask, 'Time to resolve'] = (data.loc[security_mask, 'Time to resolve'] * 1.5).astype(int)

# Example trend: Specific product version having more 'Software / Software Error' issues
# NOTE(review): this previously filtered on 'v1.0', which is not one of the
# generated versions ('v13'-'v17'), so the trend was never applied. 'v13' is
# used here; adjust if a different version is intended.
data.loc[data['Product version'] == 'v13', 'Root Cause'] = 'Software / Software Error'

# Adding recognizable patterns to other fields
# Example trend: Environment 'Env1' having more 'Configuration' issues
data.loc[data['Environment'] == 'Env1', 'Root Cause'] = 'Configuration'

# Example trend: Product type 'BIG-IP APM' having longer resolution times
data.loc[data['Product type'] == 'BIG-IP APM', 'Time to resolve'] += 5

# Report the actual row count rather than assuming the default sample size.
print(f"Sample data with {len(data)} examples loaded successfully.")
data.head()

In [None]:
# Data Exploration
# Display the first few rows of the dataframe
print(data.head())

# Summary statistics
print(data.describe())

# Check for missing values
print(data.isnull().sum())

# Data Visualization
# Distribution of the 'Reason for contact'
# The column is passed by keyword: recent seaborn releases interpret the
# first positional argument as `data`, not `x`.
plt.figure(figsize=(12, 6))
sns.countplot(x='Reason for contact', data=data)
plt.title('Distribution of Reason for Contact')
plt.xlabel('Reason for Contact')
plt.ylabel('Count')
plt.show()

# Distribution of Numerical Variables
numerical_columns = data.select_dtypes(include=[np.number]).columns
data[numerical_columns].hist(figsize=(12, 10))
plt.tight_layout()
plt.show()

# Value Counts for Categorical Variables
categorical_columns = data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    print(f"Value counts for {column}:")
    print(data[column].value_counts())
    print()

# Outlier Detection
# Passing the series as `x` draws a horizontal box so the x-axis label below
# matches the plotted variable.
plt.figure(figsize=(12, 6))
sns.boxplot(x=data['Time to resolve'])
plt.title('Box Plot for Time to Resolve (Outlier Detection)')
plt.xlabel('Time to Resolve')
plt.show()


# Initial Analysis
# Example: Group by 'Reason for contact' and calculate mean 'Time to resolve'
analysis = data.groupby('Reason for contact')['Time to resolve'].mean()
print(analysis)

# Save analysis results to a CSV file
analysis.to_csv('initial_analysis.csv')


In [None]:
# Correlation Analysis
# Restrict the frame to numeric dtypes; correlation is undefined for text columns.
numerical_data = data.select_dtypes(include=[np.number])

# Show which columns take part in the analysis.
print("Numerical columns for correlation analysis:")
print(numerical_data.columns)

# A correlation matrix needs at least two numeric columns to be meaningful.
if len(numerical_data.columns) <= 1:
    print("Not enough numerical columns for correlation analysis.")
else:
    corr_matrix = numerical_data.corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
    plt.title('Correlation Matrix')
    plt.show()

In [None]:
# Pearson Correlation Matrix
# Pearson's r captures the strength of the linear relationship between each
# pair of numeric columns.
pearson_corr = numerical_data.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(pearson_corr, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Pearson Correlation Matrix')
plt.show()


In [None]:
# Spearman Correlation Matrix
# Rank-based correlation: picks up monotonic relationships even when they are
# not linear.
spearman_corr = numerical_data.corr(method='spearman')

plt.figure(figsize=(12, 10))
sns.heatmap(spearman_corr, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Spearman Correlation Matrix')
plt.show()


In [None]:
# Box Plot for 'Time to resolve' by 'Reason for contact'
# One box per contact reason, showing the spread of resolution times.
plt.figure(figsize=(12, 6))
sns.boxplot(data=data, x='Reason for contact', y='Time to resolve')
plt.title('Box Plot of Time to Resolve by Reason for Contact')
plt.show()

In [None]:
# Kendall Correlation Matrix
# Kendall's tau is another rank correlation; it copes well with small samples
# and tied values.
kendall_corr = numerical_data.corr(method='kendall')

plt.figure(figsize=(12, 10))
sns.heatmap(kendall_corr, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Kendall Correlation Matrix')
plt.show()


In [None]:
# Pair Plot (Scatterplot Matrix)
# Scatter plots for every pair of numeric columns, with each column's own
# distribution on the diagonal.
sns.pairplot(data=numerical_data)
plt.suptitle('Pair Plot of Numerical Data', y=1.02)
plt.show()


In [None]:
# Enhanced Heatmap with Annotations
# Same Pearson matrix, with two-decimal cell labels and a labelled colour bar.
enhanced_corr = numerical_data.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    enhanced_corr,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    cbar_kws={'label': 'Correlation Coefficient'},
)
plt.title('Enhanced Pearson Correlation Matrix')
plt.show()


In [None]:
# Correlation Matrix with Clustering
# Groups variables based on similarity in their correlation patterns.
from scipy.cluster.hierarchy import dendrogram, linkage

# Hierarchical clustering of the variables, using each variable's row of the
# correlation matrix as its feature vector.
clustering_corr = numerical_data.corr()
# Constant columns produce all-NaN rows/columns; linkage rejects non-finite input.
clustering_corr = clustering_corr.dropna(how='all', axis=0).dropna(how='all', axis=1)

if clustering_corr.shape[0] < 2:
    # linkage() needs at least two observations; with a single numeric column
    # (e.g. only 'Time to resolve') it would raise instead of plotting.
    print("Not enough numerical columns for hierarchical clustering.")
else:
    # Calculate the linkage
    Z = linkage(clustering_corr, 'ward')

    # Plot the dendrogram
    plt.figure(figsize=(10, 7))
    # A plain list of labels avoids truth-value issues some SciPy versions
    # have with array-like label containers.
    dendrogram(Z, labels=clustering_corr.index.tolist(), leaf_rotation=90)
    plt.title('Dendrogram of Correlation Matrix')
    plt.xlabel('Variables')
    plt.ylabel('Distance')
    plt.show()


In [None]:
# Correlation with Categorical Data (Cramér's V) 
# Measures association between categorical variables.
import scipy.stats as ss
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def cramers_v(x, y):
    """Return the bias-corrected Cramér's V association between two series.

    A contingency table is built with ``pd.crosstab`` and its chi-squared
    statistic taken from ``scipy.stats.chi2_contingency``. Both phi-squared
    and the table dimensions receive the small-sample bias correction before
    the ratio is formed.

    Args:
        x: First categorical series.
        y: Second categorical series, aligned with ``x``.

    Returns:
        The corrected Cramér's V as a float, or ``np.nan`` when the corrected
        denominator is exactly zero.
    """
    table = pd.crosstab(x, y)
    chi2_stat = ss.chi2_contingency(table)[0]
    total = table.sum().sum()
    n_rows, n_cols = table.shape

    dof_n = total - 1
    phi2_adj = max(0, chi2_stat / total - ((n_cols - 1) * (n_rows - 1)) / dof_n)
    rows_adj = n_rows - ((n_rows - 1) ** 2) / dof_n
    cols_adj = n_cols - ((n_cols - 1) ** 2) / dof_n

    # Guard against dividing by zero when a corrected dimension collapses to 1.
    scale = min((cols_adj - 1), (rows_adj - 1))
    if scale == 0:
        return np.nan
    return np.sqrt(phi2_adj / scale)

# Load the data from values.csv when it is available. If the file is missing
# or empty, keep the DataFrame already in memory (the generated sample data)
# instead of failing with FileNotFoundError.
import os
if os.path.exists('values.csv') and os.path.getsize('values.csv') > 0:
    data = pd.read_csv('values.csv')

# Select categorical columns
categorical_data = data.select_dtypes(include=['object']).copy()

# Handle missing values by filling with a placeholder
categorical_data = categorical_data.fillna('Missing')

# Remove columns with only one unique value
categorical_data = categorical_data[[col for col in categorical_data.columns if categorical_data[col].nunique() > 1]]

# Initialize the correlation matrix
categorical_corr = pd.DataFrame(index=categorical_data.columns, columns=categorical_data.columns)

# Compute Cramér's V for categorical columns
for col1 in categorical_data.columns:
    for col2 in categorical_data.columns:
        if col1 != col2:
            categorical_corr.loc[col1, col2] = cramers_v(categorical_data[col1], categorical_data[col2])
        else:
            categorical_corr.loc[col1, col2] = 1.0  # Set the diagonal to 1.0

# The frame was created with object dtype; the heatmap needs floats.
categorical_corr = categorical_corr.astype(float)

# Plot the Categorical Correlation Matrix
plt.figure(figsize=(12, 10))
sns.heatmap(categorical_corr, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Categorical Correlation Matrix (Cramér\'s V)')
plt.show()


In [None]:
# Word Cloud for 'Customer description'
from wordcloud import WordCloud

# Concatenate every customer description into one corpus string.
text = " ".join(data['Customer description'])

# Build the cloud from the 100 most frequent words on a white background.
cloud_builder = WordCloud(max_font_size=50, max_words=100, background_color="white")
wordcloud = cloud_builder.generate(text)

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


In [None]:
# Sentiment Analysis of Customer Descriptions
# TextBlob is imported here because this cell runs before the later cell
# that imports it; without this the lambda below raises NameError.
from textblob import TextBlob

# Polarity ranges from -1 (negative) to +1 (positive).
data['Sentiment'] = data['Customer description'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Visualize the distribution of sentiment polarity
plt.figure(figsize=(12, 6))
sns.histplot(data['Sentiment'], bins=20, kde=True)
plt.title('Distribution of Sentiment in Customer Descriptions')
plt.xlabel('Sentiment Polarity')
plt.ylabel('Count')
plt.show()


In [None]:
# Topic Modeling with LDA: Identify topics within customer descriptions.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Turn the descriptions into a bag-of-words count matrix (English stop words removed).
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['Customer description'])

# Fit a 5-topic LDA model; the fixed random_state keeps topics stable between runs.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)

# Print the highest-weighted words of each topic.
n_top_words = 10
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    strongest = topic.argsort()[::-1][:n_top_words]
    top_words = [feature_names[i] for i in strongest]
    print(f"Topic #{topic_idx}: {' '.join(top_words)}")


In [None]:
# Analyze the length of customer descriptions
# Character count of every description.
data['Description Length'] = data['Customer description'].map(len)

# Histogram (with KDE overlay) of the description lengths.
plt.figure(figsize=(12, 6))
sns.histplot(data=data, x='Description Length', bins=20, kde=True)
plt.title('Distribution of Customer Description Lengths')
plt.xlabel('Description Length')
plt.ylabel('Count')
plt.show()


In [None]:
# Keyword Extraction

from sklearn.feature_extraction.text import TfidfVectorizer

# Extract keywords using TF-IDF
# max_features=20 keeps only the 20 terms with the highest corpus frequency.
tfidf_vectorizer = TfidfVectorizer(max_features=20, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Customer description'])
keywords = tfidf_vectorizer.get_feature_names_out()

# Display the top keywords
print("Top keywords:", keywords, sep="\n")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Represent each description as a TF-IDF vector.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['Customer description'])

# Group the descriptions into five clusters and store each row's label.
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(X)
data['Cluster'] = cluster_labels

# Bar chart of cluster sizes.
plt.figure(figsize=(12, 6))
sns.countplot(data=data, x='Cluster')
plt.title('Distribution of Customer Description Clusters')
plt.xlabel('Cluster')
plt.ylabel('Count')
plt.show()


In [None]:
# N-gram Analysis
# The Counter import used to be fused into the comment line above, leaving
# Counter undefined when the bigrams were tallied.
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def get_ngrams(text, n=2):
    """Return the space-joined n-grams of ``text`` after removing stop words.

    Args:
        text: Raw text; tokens are obtained by whitespace splitting.
        n: Number of consecutive words per n-gram.

    Returns:
        A list of strings, one per n-gram, in order of appearance. Stop words
        are compared case-insensitively and removed before the n-grams are formed.
    """
    words = [word for word in text.split() if word.lower() not in ENGLISH_STOP_WORDS]
    # zip over n progressively shifted views of the word list yields the sliding windows.
    ngrams = zip(*[words[i:] for i in range(n)])
    return [' '.join(ngram) for ngram in ngrams]

# Extract bigrams from customer descriptions
bigrams = Counter()
for description in data['Customer description']:
    bigrams.update(get_ngrams(description, n=2))

# Display the most common bigrams
print("Most common bigrams:")
for bigram, count in bigrams.most_common(10):
    print(f"{bigram}: {count}")


*** Spacy NLP

In [None]:
# Import Libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from spacy.matcher import Matcher
from spacy.lang.en.stop_words import STOP_WORDS
import string

# Install the larger spaCy model
# !python -m spacy download en_core_web_md

# Load the spaCy model for NER and word vectors
nlp = spacy.load('en_core_web_md')

# Load the data from values.csv when it is available. If the file is missing
# or empty, keep the DataFrame already in memory instead of failing with
# FileNotFoundError.
import os
if os.path.exists('values.csv') and os.path.getsize('values.csv') > 0:
    data = pd.read_csv('values.csv')

# Function to extract n-grams
def extract_ngrams(text_series, n=2, top_n=20):
    """Return the ``top_n`` most frequent n-grams of order ``n``.

    Args:
        text_series: Iterable of documents accepted by ``CountVectorizer``.
        n: Size of the n-grams (used as both bounds of ``ngram_range``).
        top_n: Maximum number of entries returned.

    Returns:
        A list of ``(ngram, count)`` pairs sorted by descending corpus count.
    """
    counter = CountVectorizer(ngram_range=(n, n), stop_words='english')
    doc_term = counter.fit_transform(text_series)
    vocabulary = counter.get_feature_names_out()
    # Column sums give the corpus-wide count of each n-gram.
    totals = doc_term.sum(axis=0).A1
    ranked = sorted(zip(vocabulary, totals), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Extract top 20 bigrams and trigrams
# Each result is a list of (ngram, count) pairs, most frequent first.
top_bigrams = extract_ngrams(data['Customer description'], n=2, top_n=20)
top_trigrams = extract_ngrams(data['Customer description'], n=3, top_n=20)

# Generate patterns from top n-grams
def generate_patterns(ngrams, label):
    """Build one ``{"label", "pattern"}`` dict per n-gram.

    Args:
        ngrams: Iterable of ``(ngram, count)`` pairs; the count is ignored.
        label: Entity label assigned to every generated pattern.

    Returns:
        A list of pattern dicts in the same order as ``ngrams``.
    """
    return [{"label": label, "pattern": phrase} for phrase, _ in ngrams]

# Create patterns for EntityRuler
# Bigrams and trigrams get distinct labels and are concatenated into one list,
# bigram patterns first.
bigram_patterns = generate_patterns(top_bigrams, "BIGRAM")
trigram_patterns = generate_patterns(top_trigrams, "TRIGRAM")
patterns = bigram_patterns + trigram_patterns

# Create patterns for Matcher
def generate_matcher_patterns(ngrams):
    """Map each n-gram to a token pattern of ``{"LOWER": word}`` dicts.

    Args:
        ngrams: Iterable of ``(ngram, count)`` pairs; the count is ignored.

    Returns:
        A dict keyed by the n-gram string whose value is a list with one
        ``{"LOWER": word}`` entry per whitespace-separated word.
    """
    return {
        phrase: [{"LOWER": word} for word in phrase.split()]
        for phrase, _ in ngrams
    }

# One Matcher pattern per top bigram and trigram, keyed by the n-gram text.
matcher_patterns = generate_matcher_patterns(top_bigrams + top_trigrams)

# Part-of-Speech (POS) Tagging
def pos_tagging(text):
    """Return ``(token text, coarse POS tag)`` pairs for every token in ``text``."""
    tagged = []
    for token in nlp(text):
        tagged.append((token.text, token.pos_))
    return tagged

# Dependency Parsing
def dependency_parsing(text):
    """Return ``(token text, dependency label, head text)`` triples for ``text``."""
    triples = []
    for token in nlp(text):
        triples.append((token.text, token.dep_, token.head.text))
    return triples

# Noun Phrase Extraction
def extract_noun_phrases(text):
    """Return the text of every noun chunk spaCy finds in ``text``."""
    parsed = nlp(text)
    phrases = []
    for chunk in parsed.noun_chunks:
        phrases.append(chunk.text)
    return phrases

# Text Summarization
def summarize_text(text, n_sentences=2):
    """Return the first ``n_sentences`` sentences of ``text`` joined by spaces.

    This is a naive lead-sentence summary: sentences are taken in document
    order as segmented by spaCy.
    """
    all_sentences = [sent.text for sent in nlp(text).sents]
    leading = all_sentences[:n_sentences]
    return ' '.join(leading)

# Entity Ruler
def entity_ruler(text, patterns):
    """Return ``(entity text, label)`` pairs for ``text`` with the rule-based ruler active.

    On the first call the ``entity_ruler`` component is added to the shared
    ``nlp`` pipeline (after ``ner``) and loaded with ``patterns``. On later
    calls the component already exists, so ``patterns`` is not applied again.
    """
    ruler_missing = 'entity_ruler' not in nlp.pipe_names
    if ruler_missing:
        nlp.add_pipe("entity_ruler", after="ner").add_patterns(patterns)
    return [(ent.text, ent.label_) for ent in nlp(text).ents]

# Text Cleaning
def clean_text(text):
    """Return the lower-cased lemmas of ``text`` without stop words or punctuation.

    A token is dropped when its lower-cased text is in ``STOP_WORDS`` or its
    text is contained in ``string.punctuation``.
    """
    kept = []
    for token in nlp(text):
        raw = token.text
        if raw.lower() in STOP_WORDS or raw in string.punctuation:
            continue
        kept.append(token.lemma_.lower())
    return " ".join(kept)

# Pattern Matching
def pattern_matching(text, patterns):
    """Return ``(matched text, pattern id)`` pairs found in ``text``.

    Args:
        text: Document to scan.
        patterns: Mapping of pattern id to a spaCy token pattern.

    A fresh ``Matcher`` is built from ``patterns`` on every call.
    """
    matcher = Matcher(nlp.vocab)
    for name in patterns:
        matcher.add(name, [patterns[name]])

    doc = nlp(text)
    found = []
    for match_id, start, end in matcher(doc):
        # match_id is a hash; the vocab's string store maps it back to the pattern id.
        found.append((doc[start:end].text, nlp.vocab.strings[match_id]))
    return found

# Sentence Segmentation
def sentence_segmentation(text):
    """Return the sentences of ``text`` as a list of strings, in document order."""
    pieces = []
    for sentence in nlp(text).sents:
        pieces.append(sentence.text)
    return pieces

# Semantic Similarity Analysis
def semantic_similarity(text1, text2):
    """Return spaCy's ``Doc.similarity`` score between the two texts."""
    first_doc, second_doc = nlp(text1), nlp(text2)
    return first_doc.similarity(second_doc)

# Apply the functions to customer descriptions
# Each entry maps an output column to the function that derives it from one
# description. The order matters: 'Entity Ruler' adds a component to the
# shared pipeline, so 'Entities' has to be computed before it.
column_builders = [
    ('Entities', lambda x: [(ent.text, ent.label_) for ent in nlp(x).ents]),
    ('POS Tags', pos_tagging),
    ('Dependency Parse', dependency_parsing),
    ('Noun Phrases', extract_noun_phrases),
    ('Summary', summarize_text),
    ('Entity Ruler', lambda x: entity_ruler(x, patterns)),
    ('Cleaned Text', clean_text),
    ('Pattern Matches', lambda x: pattern_matching(x, matcher_patterns)),
    ('Sentences', sentence_segmentation),
]
for column_name, builder in column_builders:
    data[column_name] = data['Customer description'].apply(builder)

# Example similarity between the first two customer descriptions
first_description = data['Customer description'].iloc[0]
second_description = data['Customer description'].iloc[1]
similarity = semantic_similarity(first_description, second_description)
print(f"Similarity between the first two customer descriptions: {similarity:.2f}")

# Display some examples: heading, then the first rows of the source text
# next to the derived column.
display_sections = [
    ("Named Entities", 'Entities'),
    ("POS Tags", 'POS Tags'),
    ("Dependency Parse", 'Dependency Parse'),
    ("Noun Phrases", 'Noun Phrases'),
    ("Summary", 'Summary'),
    ("Entity Ruler", 'Entity Ruler'),
    ("Cleaned Text", 'Cleaned Text'),
    ("Pattern Matches", 'Pattern Matches'),
    ("Sentences", 'Sentences'),
]
for heading, derived_column in display_sections:
    print(f"{heading}:\n", data[['Customer description', derived_column]].head(), "\n")


In [None]:
# Extract keywords using TF-IDF
# NOTE(review): this repeats the earlier keyword-extraction cell; at this
# point it runs on the DataFrame as it stands after the spaCy cell.
tfidf_vectorizer = TfidfVectorizer(max_features=20, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Customer description'])
keywords = tfidf_vectorizer.get_feature_names_out()

# Display the top keywords
print("Top keywords:", keywords, sep="\n")


In [None]:
# Text Classification with Machine Learning

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Prepare data for text classification
# 'Resolution code' never holds the literal value 'Resolved', so comparing
# against it labelled every case 0 and the model was never trained. The
# target is derived from the set of codes that represent a resolved case.
resolved_codes = [
    "Upgraded Software / Engineering Hotfix", "Request for Enhancement", "Bugzilla ID existing",
    "Updated Configuration", "Resolved by another group", "Duplicate", "Bugzilla ID New",
    "Applied Workaround", "Updated License", "Customer Resolved", "Updated Account",
    "Educated Customer"
]
X = data['Customer description']
y = data['Resolution code'].apply(lambda x: 1 if x in resolved_codes else 0)  # Binary classification: Resolved vs. Unresolved

# Check the distribution of the target variable
print(y.value_counts())

# Ensure there are at least two classes in the target variable
if len(np.unique(y)) > 1:
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert text data to TF-IDF features; the vocabulary is learned on the
    # training split only so no information leaks from the test split.
    vectorizer = TfidfVectorizer(stop_words='english')
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Train a logistic regression model
    clf = LogisticRegression()
    clf.fit(X_train_tfidf, y_train)

    # Make predictions and evaluate the model
    y_pred = clf.predict(X_test_tfidf)
    print(classification_report(y_test, y_pred))
else:
    print("Not enough classes in the target variable for classification.")


In [None]:
# Predict the Time to resolve based on other features
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Prepare the data for predictive modeling
# Only the low-cardinality categorical fields are used as predictors. One-hot
# encoding the whole frame would also hit the free-text columns (one dummy per
# row) and the list-valued NLP columns added earlier, which get_dummies cannot
# hash.
categorical_features = [
    'Product type', 'Product version', 'Environment',
    'Knowledge base article if used', 'Bugzilla ID if found',
    'Reason for contact', 'Cause code', 'Resolution code',
]
available_features = [col for col in categorical_features if col in data.columns]
if not available_features:
    raise ValueError("None of the expected categorical feature columns are present in the data.")

X = pd.get_dummies(data[available_features], drop_first=True)  # Convert categorical variables to dummy variables
y = data['Time to resolve']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


In [None]:
# Anomaly Detection in Text
from sklearn.ensemble import IsolationForest

# Represent each description as a TF-IDF vector.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['Customer description'])

# Isolation Forest labels inliers as 1 and outliers as -1.
clf = IsolationForest(random_state=42)
anomaly_labels = clf.fit_predict(X)
data['Anomaly'] = anomaly_labels

# How many rows fall into each class.
print(data['Anomaly'].value_counts())

# A few of the descriptions flagged as anomalous.
flagged = data['Anomaly'] == -1
print(data.loc[flagged, ['Customer description', 'Anomaly']].head())


In [None]:
# The DataFrame may have been reloaded from CSV since sentiment was first
# computed, which drops the derived 'Sentiment' column; recompute it when it
# is missing so the plots below do not fail on an unknown column.
if 'Sentiment' not in data.columns:
    from textblob import TextBlob
    data['Sentiment'] = data['Customer description'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Sentiment Analysis by Product Type
plt.figure(figsize=(12, 6))
sns.boxplot(x='Product type', y='Sentiment', data=data)
plt.title('Sentiment Analysis by Product Type')
plt.xlabel('Product Type')
plt.ylabel('Sentiment')
plt.xticks(rotation=45)
plt.show()

# Sentiment Analysis by Reason for Contact
plt.figure(figsize=(12, 6))
sns.boxplot(x='Reason for contact', y='Sentiment', data=data)
plt.title('Sentiment Analysis by Reason for Contact')
plt.xlabel('Reason for Contact')
plt.ylabel('Sentiment')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Compare text length between resolved and unresolved cases
# 'Description Length' is a derived column that is lost whenever the data is
# reloaded from CSV; rebuild it when absent so the histograms have their input.
if 'Description Length' not in data.columns:
    data['Description Length'] = data['Customer description'].apply(len)

# Everything not explicitly coded 'Unresolved' is treated as resolved here.
resolved_cases = data[data['Resolution code'] != 'Unresolved']
unresolved_cases = data[data['Resolution code'] == 'Unresolved']

plt.figure(figsize=(12, 6))
sns.histplot(resolved_cases['Description Length'], color='blue', label='Resolved', kde=True)
sns.histplot(unresolved_cases['Description Length'], color='red', label='Unresolved', kde=True)
plt.title('Comparison of Description Lengths between Resolved and Unresolved Cases')
plt.xlabel('Description Length')
plt.ylabel('Count')
plt.legend()
plt.show()


In [None]:
# Binary "resolved vs. not resolved" text classifier.
# Resolution codes counted as "resolved"; every other code maps to class 0.
resolved_codes = [
    "Upgraded Software / Engineering Hotfix", "Request for Enhancement", "Bugzilla ID existing",
    "Updated Configuration", "Resolved by another group", "Duplicate", "Bugzilla ID New",
    "Applied Workaround", "Updated License", "Customer Resolved", "Updated Account",
    "Educated Customer"
]

# Features are the raw description text; the target is 1 for resolved codes.
X = data['Customer description']
y = data['Resolution code'].isin(resolved_codes).astype(int)

# Show the class balance.
print(y.value_counts())

# A classifier needs both classes to be present.
if len(np.unique(y)) <= 1:
    print("Not enough classes in the target variable for classification.")
else:
    # Hold out 20% of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Learn the TF-IDF vocabulary on the training split, then reuse it for the test split.
    vectorizer = TfidfVectorizer(stop_words='english')
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Fit logistic regression on the TF-IDF features.
    clf = LogisticRegression()
    clf.fit(X_train_tfidf, y_train)

    # Report precision/recall/F1 on the held-out rows.
    y_pred = clf.predict(X_test_tfidf)
    print(classification_report(y_test, y_pred))

In [None]:
# Time Series Analysis
# Move 'Date' into the index only when it is still a column, so the cell can
# be re-run without a KeyError after the first execution.
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'])
    data.set_index('Date', inplace=True)

# Resample the data by month and calculate the mean 'Time to resolve'
# ('ME' = month-end frequency alias).
monthly_data = data['Time to resolve'].resample('ME').mean()

# Plot the time series
plt.figure(figsize=(12, 6))
monthly_data.plot()
plt.title('Average Time to Resolve Over Time')
plt.xlabel('Date')
plt.ylabel('Time to Resolve')
plt.show()


In [None]:
from sklearn.cluster import KMeans

# Cluster the cases on whatever numeric columns the frame currently holds.
numerical_data = data.select_dtypes(include=[np.number])

# Three clusters; fixed seed for repeatable assignments.
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(numerical_data)
data['Cluster'] = kmeans.labels_

# Scatter of resolution time against cause code, coloured by cluster.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=data, x='Time to resolve', y='Cause code', hue='Cluster', palette='viridis')
plt.title('Clustering of Cases')
plt.show()


In [None]:
# Analyzing Relationships with Knowledge Base Article
kb_column = 'Knowledge base article if used'

# How often each knowledge base article was used.
kb_article_distribution = data[kb_column].value_counts()
print("Distribution of Knowledge Base Articles:")
print(kb_article_distribution)

# Mean of every numeric field per knowledge base article.
numeric_columns = data.select_dtypes(include=[np.number]).columns
kb_article_relationships = data.groupby(kb_column)[numeric_columns].mean()
print("\nRelationships between KB Articles and Numerical Fields:")
print(kb_article_relationships)

# Spread of resolution time for each article.
plt.figure(figsize=(12, 6))
sns.boxplot(data=data, x=kb_column, y='Time to resolve')
plt.title('Box Plot of Time to Resolve by Knowledge Base Article')
plt.xlabel('Knowledge Base Article')
plt.ylabel('Time to Resolve')
plt.show()

# For each categorical field, count the cases per category split by article.
categorical_fields = ['Reason for contact', 'Cause code', 'Resolution code', 'Product type']
for field in categorical_fields:
    plt.figure(figsize=(12, 6))
    sns.countplot(data=data, x=field, hue=kb_column)
    plt.title(f'Distribution of {field} by Knowledge Base Article')
    plt.xlabel(field)
    plt.ylabel('Count')
    plt.legend(title='KB Article')
    plt.show()

# Frequency Analysis of Knowledge Base Articles (most used first).
usage_order = data[kb_column].value_counts().index
plt.figure(figsize=(12, 6))
sns.countplot(data=data, y=kb_column, order=usage_order)
plt.title('Frequency of Knowledge Base Articles')
plt.xlabel('Count')
plt.ylabel('Knowledge Base Article')
plt.show()

# Resolution Effectiveness Analysis
plt.figure(figsize=(12, 6))
sns.boxplot(data=data, x=kb_column, y='Time to resolve')
plt.title('Resolution Time by Knowledge Base Article')
plt.xlabel('Knowledge Base Article')
plt.ylabel('Time to Resolve')
plt.xticks(rotation=45)
plt.show()

# Calculate average resolution time with and without KB articles
# A case has no KB article when the field is missing (NaN) or the literal
# string 'None'. Recent pandas versions parse 'None' and empty cells as NaN on
# read_csv, so a plain string comparison would count those rows as "with KB".
kb_values = data['Knowledge base article if used']
has_kb = kb_values.notna() & (kb_values.astype(str).str.strip() != 'None')
with_kb = data.loc[has_kb, 'Time to resolve'].mean()
without_kb = data.loc[~has_kb, 'Time to resolve'].mean()
print(f"Average resolution time with KB: {with_kb}")
print(f"Average resolution time without KB: {without_kb}")



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Regression on the numeric columns only.
numerical_data = data.select_dtypes(include=[np.number])

# Quick sanity check: preview the frame and count missing values per column.
print(numerical_data.head())
print(numerical_data.isnull().sum())

# Target is the resolution time; every other numeric column is a predictor.
y = numerical_data['Time to resolve']
X = numerical_data.drop(columns=['Time to resolve'], errors='ignore')

# Fail early with a clear message if there is nothing to train on.
if X.empty or y.empty:
    raise ValueError("The input data X or y is empty. Please check the data preparation steps.")

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


In [None]:
# Feature Engineering: derive calendar features from the index.
# assumes the index is a DatetimeIndex (set by the time-series cell) — TODO confirm
date_index = data.index
data['Month'] = date_index.month
data['Day'] = date_index.day
data['Day of Week'] = date_index.dayofweek

# Display the updated DataFrame
print(data.head())


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current model. scikit-learn reports the
# negated MSE, so the sign is flipped before printing the mean.
scores = cross_val_score(estimator=model, X=X, y=y, scoring='neg_mean_squared_error', cv=5)
print(f'Cross-Validation Scores: {scores}')
mean_cv_mse = -scores.mean()
print(f'Mean Cross-Validation Score: {mean_cv_mse}')


In [None]:
from sklearn.model_selection import cross_val_score

# NOTE(review): this cell repeats the previous cross-validation cell.
# 5-fold cross-validation; the scorer returns negated MSE, hence the sign flip.
scores = cross_val_score(estimator=model, X=X, y=y, scoring='neg_mean_squared_error', cv=5)
print(f'Cross-Validation Scores: {scores}')
mean_cv_mse = -scores.mean()
print(f'Mean Cross-Validation Score: {mean_cv_mse}')


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates for the random forest (3 x 4 x 3 combinations).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive 5-fold search, scored by negated mean squared error.
base_forest = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(base_forest, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Report the winning combination.
print(f'Best Parameters: {grid_search.best_params_}')


In [None]:
# Load the spaCy model for NER and word vectors
nlp = spacy.load('en_core_web_md')

# Load the data from values.csv when it is available. If the file is missing
# or empty, keep the DataFrame already in memory instead of failing with
# FileNotFoundError.
import os
if os.path.exists('values.csv') and os.path.getsize('values.csv') > 0:
    data = pd.read_csv('values.csv')

# Data Preparation
# Only the low-cardinality categorical fields are encoded. One-hot encoding
# the free-text columns would create one dummy per row, which makes the
# feature-importance ranking below meaningless.
categorical_features = [
    'Product type', 'Product version', 'Environment',
    'Knowledge base article if used', 'Bugzilla ID if found',
    'Reason for contact', 'Cause code', 'Resolution code',
]
available_features = [col for col in categorical_features if col in data.columns]
if not available_features:
    raise ValueError("None of the expected categorical feature columns are present in the data.")

X = pd.get_dummies(data[available_features], drop_first=True)  # Convert categorical variables to dummy variables
feature_names = X.columns  # Dummy column names, aligned with model.feature_importances_
y = data['Time to resolve']

# Predictive Modeling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Feature Importance
importances = model.feature_importances_
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Plot feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importances.head(10))
plt.title('Top 10 Important Features')
plt.show()

In [None]:
from sklearn.ensemble import IsolationForest

# Flag unusual cases with an Isolation Forest fitted on `numerical_data`
# (built in an earlier cell; assumed to be row-aligned with `data` -- TODO confirm).
# contamination=0.1 asks the model to treat roughly 10% of the rows as outliers.
# random_state is pinned, like the other estimators in this notebook, so the
# set of flagged rows does not change from one run to the next.
iso_forest = IsolationForest(contamination=0.1, random_state=42)

# fit_predict labels every row: 1 for inliers, -1 for outliers.
data['Anomaly'] = iso_forest.fit_predict(numerical_data)

# Plot the anomalies: resolution time against cause code, coloured by label.
plt.figure(figsize=(12, 6))
sns.scatterplot(x='Time to resolve', y='Cause code', hue='Anomaly', data=data, palette='viridis')
plt.title('Anomaly Detection')
plt.show()


In [None]:
from textblob import TextBlob

# Score each customer description with TextBlob's sentiment polarity.
# Empty CSV cells are read back as NaN (a float), which TextBlob refuses to
# accept; such rows get a NaN sentiment instead of aborting the whole cell.
data['Sentiment'] = data['Customer description'].apply(
    lambda text: TextBlob(text).sentiment.polarity if isinstance(text, str) else np.nan
)

# Plot the distribution of sentiment scores (histogram with a KDE overlay).
plt.figure(figsize=(12, 6))
sns.histplot(data['Sentiment'], bins=10, kde=True)
plt.title('Sentiment Analysis of Customer Descriptions')
plt.show()


In [None]:
import plotly.express as px
import plotly.io as pio

# Make the notebook renderer the default for every Plotly figure.
pio.renderers.default = 'notebook'

# Work on a copy of the data with a fresh integer index (the old index
# becomes a regular column), and preview it before plotting.
data_reset = data.reset_index()
print(data_reset.head())

# Interactive scatter: resolution time vs. cause code, coloured by contact reason.
fig = px.scatter(
    data_frame=data_reset,
    x='Time to resolve',
    y='Cause code',
    color='Reason for contact',
    title='Interactive Scatter Plot of Time to Resolve by Cause Code',
)

# Render explicitly with the notebook renderer.
fig.show(renderer='notebook')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Grid of pairwise plots over the columns of data_reset,
# with points coloured by contact reason.
sns.pairplot(data=data_reset, hue='Reason for contact')

# Title placed slightly above the grid (y > 1) so it clears the top row.
plt.suptitle('Pair Plot of Key Variables', y=1.02)
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Boolean mask: True wherever a cell of data_reset is missing.
missing_mask = data_reset.isnull()

# One heatmap cell per DataFrame cell; the colour bar is omitted because
# the mask only has two states.
plt.figure(figsize=(12, 6))
sns.heatmap(missing_mask, cmap='viridis', cbar=False)
plt.title('Heatmap of Missing Values')
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One violin per contact reason, showing how resolution time is distributed
# within that group.
plt.figure(figsize=(12, 6))
sns.violinplot(
    data=data_reset,
    x='Reason for contact',
    y='Time to resolve',
    palette='muted',
)
plt.title('Violin Plot of Time to Resolve by Reason for Contact')
plt.show()


In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder

# Encoders for the two categorical columns used by the tree.
le_cause_code = LabelEncoder()
le_reason_for_contact = LabelEncoder()

# Encode into a separate frame instead of overwriting data_reset. Overwriting
# replaced the original labels with integers, so re-running this cell lost the
# class names and later plots of data_reset silently showed codes, not labels.
tree_data = pd.DataFrame(
    {
        'Cause code': le_cause_code.fit_transform(data_reset['Cause code']),
        'Time to resolve': data_reset['Time to resolve'],
        'Reason for contact': le_reason_for_contact.fit_transform(data_reset['Reason for contact']),
    },
    index=data_reset.index,
)

# Train a decision tree that predicts the contact reason from the encoded
# cause code and the resolution time.
X = tree_data[['Cause code', 'Time to resolve']]
y = tree_data['Reason for contact']
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X, y)

# Plot the decision tree. Class names are passed as plain strings, taken from
# the encoder so they line up with the integer class ids.
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    filled=True,
    feature_names=['Cause code', 'Time to resolve'],
    class_names=[str(name) for name in le_reason_for_contact.classes_],
)
plt.title('Decision Tree Visualization')
plt.show()


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words matrix built from the customer descriptions, with English
# stop words removed.
text_data = data_reset['Customer description'].tolist()
vectorizer = CountVectorizer(stop_words='english')
X_text = vectorizer.fit_transform(text_data)

# Three-topic LDA model with a fixed seed.
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(X_text)

# For every topic, list its `num_words` highest-weighted vocabulary terms,
# strongest first.
num_words = 10
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top_term_ids = topic.argsort()[::-1][:num_words]
    print(f"Topic #{topic_idx + 1}:")
    print(" ".join(feature_names[i] for i in top_term_ids))


In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource
from sklearn.preprocessing import LabelEncoder

# Replace the 'Cause code' column of data_reset with integer codes so it can
# be placed on a numeric axis.
le_cause_code = LabelEncoder()
cause_code_ids = le_cause_code.fit_transform(data_reset['Cause code'])
data_reset['Cause code'] = cause_code_ids

# Wrap the whole frame as a Bokeh data source.
source = ColumnDataSource(data_reset)

# Send Bokeh output to the notebook and set up the figure.
output_notebook()
p = figure(
    title='Interactive Plot of Time to Resolve by Cause Code',
    x_axis_label='Time to resolve',
    y_axis_label='Cause code',
)

# scatter() is used rather than circle() to avoid a deprecation warning.
p.scatter(
    'Time to resolve',
    'Cause code',
    source=source,
    size=10,
    color='navy',
    alpha=0.5,
)

show(p)


In [None]:
# t-SNE projection
# Reduces `numerical_data` to two dimensions and plots the embedding,
# coloured by contact reason.

from sklearn.manifold import TSNE
import seaborn as sns

# Two-component embedding with a fixed seed.
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(numerical_data)

# Embedding coordinates plus the contact reason taken from data_reset
# (matched on index).
tsne_df = pd.DataFrame(tsne_results, columns=['TSNE1', 'TSNE2'])
tsne_df['Reason for contact'] = data_reset['Reason for contact']

# Scatter of the two embedding axes.
plt.figure(figsize=(12, 6))
sns.scatterplot(
    data=tsne_df,
    x='TSNE1',
    y='TSNE2',
    hue='Reason for contact',
    palette='viridis',
)
plt.title('t-SNE Clustering of Cases')
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import parallel_coordinates
from sklearn.preprocessing import LabelEncoder

# Independent copy of the four columns shown in the plot, so the encoding
# below does not touch `data`.
parallel_columns = ['Reason for contact', 'Time to resolve', 'Cause code', 'Product type']
parallel_data = data[parallel_columns].copy()

# One encoder per categorical column; each column is replaced by its integer codes.
le_reason, le_product, le_cause_code = LabelEncoder(), LabelEncoder(), LabelEncoder()
encoders_by_column = (
    ('Reason for contact', le_reason),
    ('Product type', le_product),
    ('Cause code', le_cause_code),
)
for encoded_column, encoder in encoders_by_column:
    parallel_data[encoded_column] = encoder.fit_transform(parallel_data[encoded_column])

# One line per case, coloured by its (encoded) contact reason.
plt.figure(figsize=(15, 7))
parallel_coordinates(
    parallel_data,
    'Reason for contact',
    colormap=plt.get_cmap("Set2"),
)
plt.title('Parallel Coordinates Plot')
plt.show()


In [None]:
import plotly.graph_objects as go

# Mean resolution time per product type (one row per type).
radial_data = data.groupby('Product type')['Time to resolve'].mean().reset_index()

# Print the radial_data to debug
print(radial_data)

# Repeat the three base colours so that every bar gets one, however many
# product types the data contains.
base_colors = ['#ff6361', '#bc5090', '#ffa600']
bar_colors = [base_colors[i % len(base_colors)] for i in range(len(radial_data))]

# Create a radial bar chart. Each bar is labelled with its actual product
# type (rather than an anonymous "Type N") so the chart can be read without
# cross-referencing the printed table.
fig = go.Figure(go.Barpolar(
    r=radial_data['Time to resolve'],
    theta=radial_data['Product type'].astype(str),
    marker_color=bar_colors,
    marker_line_color="black",
    marker_line_width=2,
    opacity=0.8
))

fig.update_layout(
    title='Radial Bar Chart of Time to Resolve by Product Type',
    polar=dict(
        radialaxis=dict(showticklabels=True, ticks=''),
        # Treat the angular axis as categorical so the labels are not parsed as numbers.
        angularaxis=dict(type='category'),
    ),
    showlegend=False
)

fig.show()


In [None]:
# PCA projection
# Reduces `numerical_data` to its first two principal components for plotting.
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

# Fit PCA and project the data in one step.
pca = PCA(n_components=2)
pca_results = pca.fit_transform(numerical_data)

# Component scores plus the contact reason taken from `data` (matched on index).
pca_df = pd.DataFrame(pca_results, columns=['PCA1', 'PCA2'])
pca_df['Reason for contact'] = data['Reason for contact']

# Scatter of the two components, coloured by contact reason.
plt.figure(figsize=(12, 6))
sns.scatterplot(
    data=pca_df,
    x='PCA1',
    y='PCA2',
    hue='Reason for contact',
)
plt.title('PCA of Cases')
plt.show()


In [None]:
# Analyzing relationships with Bugzilla IDs

# Frequency Analysis of Bugzilla IDs (bars ordered from most to least frequent)
plt.figure(figsize=(12, 6))
sns.countplot(y='Bugzilla ID if found', data=data, order=data['Bugzilla ID if found'].value_counts().index)
plt.title('Frequency of Bugzilla IDs')
plt.xlabel('Count')
plt.ylabel('Bugzilla ID')
plt.show()

# Resolution Effectiveness Analysis: resolution-time distribution per Bugzilla ID
plt.figure(figsize=(12, 6))
sns.boxplot(x='Bugzilla ID if found', y='Time to resolve', data=data)
plt.title('Resolution Time by Bugzilla ID')
plt.xlabel('Bugzilla ID')
plt.ylabel('Time to Resolve')
plt.xticks(rotation=45)
plt.show()

# Calculate average resolution time with and without Bugzilla IDs.
# A case counts as "without" when the ID is either the literal string 'None'
# or missing (NaN). Checking only for the string would put every NaN row in
# the "with" group, because NaN != 'None' evaluates to True.
bz_ids = data['Bugzilla ID if found']
has_bz = bz_ids.notna() & (bz_ids != 'None')
with_bz = data.loc[has_bz, 'Time to resolve'].mean()
without_bz = data.loc[~has_bz, 'Time to resolve'].mean()
print(f"Average resolution time with Bugzilla ID: {with_bz}")
print(f"Average resolution time without Bugzilla ID: {without_bz}")

# Categorical Relationships with Bugzilla IDs
def _countplot_by_bugzilla(column, label):
    """Show a count plot of `column` with bars split by Bugzilla ID.

    `label` is the text used for the x-axis label and inside the title.
    """
    plt.figure(figsize=(12, 6))
    sns.countplot(data=data, x=column, hue='Bugzilla ID if found')
    plt.title(f'Distribution of {label} by Bugzilla ID')
    plt.xlabel(label)
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.legend(title='Bugzilla ID')
    plt.show()


# First pass: one chart per categorical field, labelled with the raw column name.
categorical_fields = ['Reason for contact', 'Cause code', 'Resolution code', 'Product type']
for field in categorical_fields:
    _countplot_by_bugzilla(field, field)

# Second pass: product type, resolution code and cause code again, this time
# with title-cased labels.
titled_fields = (
    ('Product type', 'Product Type'),
    ('Resolution code', 'Resolution Code'),
    ('Cause code', 'Cause Code'),
)
for bz_column, bz_label in titled_fields:
    _countplot_by_bugzilla(bz_column, bz_label)

# Time to Resolve by Bugzilla ID
plt.figure(figsize=(12, 6))
sns.boxplot(data=data, x='Bugzilla ID if found', y='Time to resolve')
plt.title('Time to Resolve by Bugzilla ID')
plt.xlabel('Bugzilla ID')
plt.ylabel('Time to Resolve')
plt.xticks(rotation=45)
plt.show()

# Mean resolution time per Bugzilla ID, shown as a two-column table.
avg_res_time_bz = (
    data.groupby('Bugzilla ID if found')['Time to resolve']
    .mean()
    .reset_index()
)
avg_res_time_bz.columns = ['Bugzilla ID', 'Average Time to Resolve']
display(avg_res_time_bz)

# Cluster Analysis with KMeans
# Columns fed to the clustering; copied so the encoding leaves `data` untouched.
features = ['Time to resolve', 'Cause code', 'Resolution code']
data_cluster = data[features].copy()

# Turn every object-typed (string) column into integer codes; columns of any
# other dtype are passed through unchanged.
for column in features:
    if data_cluster[column].dtype != 'object':
        continue
    le = LabelEncoder()
    data_cluster[column] = le.fit_transform(data_cluster[column])

# Three clusters, fixed seed; the cluster id of each row is stored on `data`.
kmeans = KMeans(n_clusters=3, random_state=42)
data['Cluster'] = kmeans.fit_predict(data_cluster)

# Resolution time vs. cause code, coloured by cluster id.
plt.figure(figsize=(12, 6))
sns.scatterplot(
    data=data,
    x='Time to resolve',
    y='Cause code',
    hue='Cluster',
    palette='Set2',
)
plt.title('Cluster Analysis of Bugzilla IDs')
plt.xlabel('Time to Resolve')
plt.ylabel('Cause Code')
plt.legend(title='Cluster')
plt.show()

# Sentiment Analysis of Customer Descriptions for Bugzilla IDs
# Missing descriptions (NaN after reading the CSV) are not strings and would
# make TextBlob raise; they are given a NaN sentiment instead.
data['Sentiment'] = data['Customer description'].apply(
    lambda text: TextBlob(text).sentiment.polarity if isinstance(text, str) else np.nan
)

# Sentiment distribution per Bugzilla ID.
plt.figure(figsize=(12, 6))
sns.boxplot(x='Bugzilla ID if found', y='Sentiment', data=data)
plt.title('Sentiment of Customer Descriptions by Bugzilla ID')
plt.xlabel('Bugzilla ID')
plt.ylabel('Sentiment')
plt.xticks(rotation=45)
plt.show()


# Simulating a Customer Feedback column for this analysis.
# The values are random placeholders, not real feedback; seeding NumPy's
# global generator makes the same labels come out on every run.
np.random.seed(42)
customer_feedback = np.random.choice(['Positive', 'Neutral', 'Negative'], size=data.shape[0])
data.loc[:, 'Customer Feedback'] = customer_feedback

# Customer Feedback by Bugzilla ID
plt.figure(figsize=(12, 6))
sns.countplot(x='Customer Feedback', hue='Bugzilla ID if found', data=data)
plt.title('Customer Feedback by Bugzilla ID')
plt.xlabel('Customer Feedback')
plt.ylabel('Count')
plt.legend(title='Bugzilla ID')
plt.show()




# Decision Tree to Understand Factors Leading to Bugzilla ID
# Take an explicit copy of the feature columns: the encoding loop below assigns
# into X, and assigning into a slice of `data` triggers pandas'
# SettingWithCopyWarning.
X = data[['Time to resolve', 'Cause code', 'Resolution code', 'Product type']].copy()
# NOTE(review): if this column contains NaN, fit() below will likely reject
# the target -- confirm how "no Bugzilla ID" is stored in values.csv.
y = data['Bugzilla ID if found']

# Encode categorical (object-typed) features as integer codes.
for column in X.columns:
    if X[column].dtype == 'object':
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column])

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X, y)

# plot_tree builds node text from the class names, so they are converted to
# plain strings in case the Bugzilla IDs were parsed as numbers.
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    filled=True,
    feature_names=list(X.columns),
    class_names=[str(label) for label in clf.classes_],
)
plt.title('Decision Tree to Understand Factors Leading to Bugzilla ID')
plt.show()


