Analyze resident-submitted move-in work order descriptions to identify common issues and patterns and to prioritize urgent problems.

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from wordcloud import WordCloud
# from rake_nltk import Rake
# from keybert import KeyBERT

In [None]:
# Fetch the NLTK data used by this notebook. 'wordnet' is required by the
# WordNetLemmatizer in Step 1; without it the first lemmatize() call raises LookupError.
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
# Load the "Updated_Pro" sheet of the work-order export. With a single sheet name
# read_excel already returns a DataFrame, so it is not wrapped in pd.DataFrame again.
Pro_ALT_UP = pd.read_excel("UTD_2024_Propertyware_FS.xlsx", sheet_name="Updated_Pro")
Pro_ALT_UP.head()

In [None]:
# Normalise the free-text column: missing descriptions become '' and every value is cast to str
Pro_ALT_UP["Description"] = Pro_ALT_UP["Description"].fillna('').astype(str)

### Step 1: Prepare Text

In [None]:
# des = Pro_ALT_UP['Description'].dropna()

In [None]:
# Working Series for the text-cleaning steps; the cleaning results are bound to `des`,
# not written back to the DataFrame column
des = Pro_ALT_UP['Description']

In [None]:
# Lower-case the text so that later counts are case-insensitive
des = des.str.lower()

In [None]:
# Remove punctuation: delete every character that is neither a word character nor whitespace.
# The pattern is a raw string so \w and \s reach the regex engine instead of being
# parsed as (invalid) string escape sequences.
des = des.str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Remove digit runs such as the "1", "2" of numbered lists.
# Raw string so \d is a regex class rather than an invalid string escape.
des = des.str.replace(r'\d+', '', regex=True)

In [None]:
# NOTE(review): repeats the Description normalisation already applied right after loading;
# it rewrites the DataFrame column only and does not affect `des`.
Pro_ALT_UP["Description"] = Pro_ALT_UP["Description"].fillna('').astype(str)

In [None]:
# Tokenize: split every description on runs of whitespace
des_tokens = des.str.split()

In [None]:
# nlp = spacy.load("en_core_web_sm")
# des_tokens = des.apply(lambda x: [token.text for token in nlp(x)])

In [None]:
# Filler words that say nothing about what is actually wrong
irrelevant_words = {
    'need', 'one', 'working', 'work', 'issue', 'problem', 'fix',
    'require', 'also', 'yes', 'please', 'coming', 'open',
}
# sklearn's English stop list ("the", "is", "and", ...) plus the filler words above
stop_words = ENGLISH_STOP_WORDS | irrelevant_words
des_tokens = des_tokens.apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
# Bring every token back to its WordNet lemma (lemmatize() called with its default arguments)
lemmatizer = WordNetLemmatizer()
des_tokens = des_tokens.apply(lambda tokens: list(map(lemmatizer.lemmatize, tokens)))

### Step 2: Word Freq Analysis

In [None]:
# Flatten the per-description token lists, then count each token
all_words = []
for tokens in des_tokens:
    all_words.extend(tokens)
word_freq = Counter(all_words)

In [None]:
# The 20 most frequent tokens, printed as "token:count"
top_words = word_freq.most_common(20)
for token, count in top_words:
    print(token, count, sep=":")

In [None]:
# Bar chart of the 20 most frequent tokens
words, freqs = zip(*top_words)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(words, freqs)
ax.set_xlabel('Words')
ax.set_ylabel('Frequency')
ax.set_title('Top 20 Most Freq Words in WO Des')
plt.xticks(rotation=45)  # acts on the current axes, i.e. `ax`
plt.show()

### Step 3: Phrase Analysis (N-grams)

In [None]:
# Generate n-grams (bigrams unless n says otherwise)
def generate_ngrams(tokenized_description, n=2):
    """Collect the n-grams of every token list into one flat list.

    Args:
        tokenized_description: Iterable of token sequences, one per description.
        n: Size of each n-gram (2 gives bigrams).

    Returns:
        A list with the n-grams of all token sequences, in input order.
    """
    return [
        gram
        for tokens in tokenized_description
        for gram in ngrams(tokens, n)
    ]

# Every pair of adjacent tokens across all cleaned descriptions
bigrams = generate_ngrams(des_tokens, n=2)

In [None]:
# Notebook display of the full bigram list
bigrams

In [None]:
# Every run of three adjacent tokens across all cleaned descriptions
trigrams = generate_ngrams(des_tokens, n=3)

In [None]:
# Occurrence counts keyed by the n-gram tuple
bigrams_freq = Counter(bigrams)
trigrams_freq = Counter(trigrams)

In [None]:
# Ten most frequent bigrams, printed as "(w1, w2):count"
top_bigrams = bigrams_freq.most_common(10)
print("Top 10 Bigrams:")
for pair, count in top_bigrams:
    print(pair, count, sep=":")

In [None]:
# Ten most frequent trigrams, printed as "(w1, w2, w3):count"
top_trigrams = trigrams_freq.most_common(10)
print("Top 10 Trigrams:")
for triple, count in top_trigrams:
    print("{}:{}".format(triple, count))

In [None]:
# Bar chart of the ten most frequent bigrams, labelled "w1 w2"
bigram_tuples, bigram_counts = zip(*top_bigrams)
bigram_words = list(map(' '.join, bigram_tuples))

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bigram_words, bigram_counts)
ax.set(xlabel='Bigram', ylabel='Frequency', title='Top 10 Bigrams')
plt.xticks(rotation=45)  # acts on the current axes, i.e. `ax`
plt.show()

In [None]:
# Pet-related n-grams are treated as noise for this analysis.
# NOTE(review): 'pte' looks like a misspelling of "pet"; 'just' and 'moved' are not pet
# terms but are filtered together with them - confirm that this is intended.
pet_keywords = {
    'pet', 'dog', 'cat', 'pets', 'animal', 'puppy', 'kitten', 'pte', 'just', 'moved',
}


def filter_ngrams(ngrams, pet_keywords):
    """Drop the n-grams that contain any of the given keywords.

    Args:
        ngrams: Iterable of (ngram, freq) pairs, each ngram being a sequence of words.
        pet_keywords: Collection of words that disqualify an n-gram.

    Returns:
        A list of the (ngram, freq) pairs without a keyword, in their original order.
    """
    kept = []
    for ngram, freq in ngrams:
        if any(word in pet_keywords for word in ngram):
            continue
        kept.append((ngram, freq))
    return kept

# most_common() with no argument yields every (ngram, count) pair ordered by descending
# count, so the filtered lists keep that ordering
filtered_bigrams = filter_ngrams(bigrams_freq.most_common(), pet_keywords)
filter_trigrams = filter_ngrams(trigrams_freq.most_common(), pet_keywords)

In [None]:
# Notebook display of all bigrams that survived the keyword filter
filtered_bigrams

In [None]:
# Notebook display of all trigrams that survived the keyword filter
filter_trigrams

In [None]:
# Ten most frequent bigrams once the pet-keyword ones are excluded
top_f_bigrams = filtered_bigrams[:10]
print("\nTop 10 Filtered Bigrams:")
for pair, count in top_f_bigrams:
    print(pair, count, sep=":")

In [None]:
# Ten most frequent trigrams once the pet-keyword ones are excluded
top_f_trigrams = filter_trigrams[:10]
print("\nTop 10 Filtered Trigrams:")
for triple, count in top_f_trigrams:
    print("{}:{}".format(triple, count))

In [None]:
# Bar chart of the ten most frequent filtered bigrams, labelled "w1 w2"
f_bigram_tuples, f_bigram_counts = zip(*top_f_bigrams)
f_bigram_words = list(map(' '.join, f_bigram_tuples))

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(f_bigram_words, f_bigram_counts)
ax.set(xlabel='Filtered Bigram', ylabel='Frequency', title='Top 10 Filtered Bigrams')
plt.xticks(rotation=45)  # acts on the current axes, i.e. `ax`
plt.show()

In [None]:
# Bar chart of the ten most frequent filtered trigrams (wider figure for the longer labels)
f_trigram_tuples, f_trigram_counts = zip(*top_f_trigrams)
f_trigram_words = list(map(' '.join, f_trigram_tuples))

fig, ax = plt.subplots(figsize=(20, 6))
ax.bar(f_trigram_words, f_trigram_counts)
ax.set_xlabel('Filtered Trigram')
ax.set_ylabel('Frequency')
ax.set_title('Top 10 Filtered Trigrams')
plt.xticks(rotation=45)  # acts on the current axes, i.e. `ax`
plt.show()

### Step 4: Topic Modeling

Group words that often appear together into topics.

In [None]:
# Show the combined stop-word set (sklearn's English list plus the custom filler words)
print(stop_words)

In [None]:
# Re-join each cleaned token list into one space-separated document string
cleaned_des = des_tokens.apply(' '.join)
# default_stop = ENGLISH_STOP_WORDS
# custom_stop = {'need', 'one', 'working','work', 'issue', 'problem','fix', 'require','pte','pet'}
# c_stop_words = default_stop.union(custom_stop)

# Bag-of-words counts; terms found in more than 85% of the documents or in fewer than
# two documents are left out of the vocabulary
vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.85)
doc_term_matrix = vectorizer.fit_transform(cleaned_des)

In [None]:
# Notebook display of the Description column as stored in the DataFrame
Pro_ALT_UP['Description']

In [None]:
# Apply LDA: fit a 3-topic model on the term-count matrix.
# random_state is fixed so that repeated runs produce the same topics.
num_topic = 3
lda_model = LatentDirichletAllocation(n_components=num_topic, random_state=42)
lda_model.fit(doc_term_matrix)

In [None]:
# Vocabulary terms of the fitted vectorizer, used to turn column indices back into words
feature_names = vectorizer.get_feature_names_out()

# Helper that prints the strongest words of every topic
def display_topics(model, feature_names, n_top_words):
    """Print each topic's highest-weighted words on one line.

    Args:
        model: Fitted topic model exposing `components_` (one weight row per topic).
        feature_names: Sequence mapping a column index to its word.
        n_top_words: Number of words to print per topic.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so the reversed tail holds the largest weights first
        strongest = weights.argsort()[:-n_top_words - 1:-1]
        top_words_for_topic = [feature_names[idx] for idx in strongest]
        print(f"Topic {topic_number}:")
        print(" ".join(top_words_for_topic))
        
# Print the topics of the fitted LDA model
display_topics(lda_model, feature_names, n_top_words=10) # display top 10 words for each topic

In [None]:
# Label every description with its strongest topic (numbered from 1)
topic_assign = lda_model.transform(doc_term_matrix)
dominant_topic = topic_assign.argmax(axis=1)

Pro_ALT_UP["Assigned Topic"] = dominant_topic + 1
preview_columns = ['Description', 'Assigned Topic']
print(Pro_ALT_UP[preview_columns].head())

In [None]:
# Average topic weight over all descriptions, drawn as one bar per topic
topic_prop = lda_model.transform(doc_term_matrix).mean(axis=0)
topic_numbers = range(1, num_topic + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(topic_numbers, topic_prop)
ax.set_xlabel('Topic Number')
ax.set_ylabel('Proportion of Des')
ax.set_title('Topic Proportions Across All Des')
ax.set_xticks(topic_numbers)
plt.show()

In [None]:
# Create one word cloud per topic, with word size driven by the topic-word weight.
# Joining the top words into a string and calling generate() gave every word a count of
# one, so the sizes did not reflect how strongly a word belongs to the topic.
for topic_idx, topic in enumerate(lda_model.components_):
    top_indices = topic.argsort()[:-50-1:-1]  # indices of the 50 highest-weighted terms
    topic_weights = {str(feature_names[i]): float(topic[i]) for i in top_indices}
    wordcloud = WordCloud(
        width=800, height=400, background_color='white'
    ).generate_from_frequencies(topic_weights)
    plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Topic{topic_idx + 1}")
    plt.show()

### Step 5: Clustering Similar Issues

In [None]:
# TF-IDF representation of the cleaned descriptions for clustering.
# NOTE(review): this rebinds `vectorizer`, replacing the CountVectorizer from Step 4.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(cleaned_des)

In [None]:
# Partition the TF-IDF vectors into five clusters; the seed is fixed for repeatable runs
num_cluster = 5
kmeans = KMeans(n_clusters=num_cluster, random_state=42).fit(tfidf_matrix)
cluster_labels = kmeans.labels_

In [None]:
# Store each description's k-means cluster label on the DataFrame
Pro_ALT_UP['Cluster'] = cluster_labels

In [None]:
# Print up to ten raw descriptions from every cluster
for cluster in range(num_cluster):
    in_cluster = Pro_ALT_UP['Cluster'] == cluster
    print(f"\nCluster {cluster}:")
    print(Pro_ALT_UP.loc[in_cluster, 'Description'].head(10).values)

In [None]:
# Ten most frequent cleaned tokens inside every cluster
for cluster in range(num_cluster):
    in_cluster = Pro_ALT_UP['Cluster'] == cluster
    token_counts = Counter()
    for text in cleaned_des[in_cluster]:
        token_counts.update(text.split())
    print(f"\nCluster {cluster} Common Words:")
    print(token_counts.most_common(10))

In [None]:
# Project the TF-IDF vectors onto two principal components and colour the points by cluster
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(reduced_data[:, 0], reduced_data[:, 1], c=cluster_labels, alpha=0.7)
fig.colorbar(points, ax=ax)
ax.set_title('Scatter Plot of Cluster')
ax.set_xlabel("PCA of Dim 1")
ax.set_ylabel("PCA of Dim 2")
plt.show()

In [None]:
# Number of descriptions per cluster (value_counts orders the bars by descending size)
cluster_size = Pro_ALT_UP['Cluster'].value_counts()
ax = cluster_size.plot.bar(figsize=(10, 6))
ax.set_title("Cluster Size")
ax.set_xlabel("Cluster")
ax.set_ylabel("Number of Des")
plt.show()

### Step 6: Emotional Study

In [None]:
def get_sen(description):
    """Return the TextBlob polarity score of one description.

    Args:
        description: Text to score.

    Returns:
        The value of TextBlob(description).sentiment.polarity (documented by TextBlob
        as a float between -1.0 and 1.0).
    """
    return TextBlob(description).sentiment.polarity


# Score the original descriptions instead of cleaned_des. The cleaned text has had its
# stop words removed, and sklearn's English stop list includes negations and intensifiers
# ("not", "no", "never", "very"), so "not working" and "working" would score the same.
# The Description column was already filled and cast to str after loading.
Pro_ALT_UP['Sentiment'] = Pro_ALT_UP['Description'].apply(get_sen)

In [None]:
# Descriptions with a polarity below zero; the ten most negative ones are printed
is_negative = Pro_ALT_UP['Sentiment'] < 0
negative_issues = Pro_ALT_UP[is_negative]
print("Top Negative Issues:")
most_negative_first = negative_issues[['Description', 'Sentiment']].sort_values(by='Sentiment')
print(most_negative_first.head(10))

In [None]:
# Histogram of the polarity scores over all descriptions
fig, ax = plt.subplots(figsize=(10, 6))
Pro_ALT_UP['Sentiment'].hist(bins=20, color='skyblue', ax=ax)
ax.set_title('Sentiment Score Dis')
ax.set_xlabel('Sentiment Score')
ax.set_ylabel('Number of Des')
plt.show()

In [None]:
# Flag priority from the polarity score: below -0.5 is 'High', below 0 is 'Medium', else 'Low'
def _priority_label(score):
    """Map one polarity score to its priority bucket."""
    if score < -0.5:
        return 'High'
    if score < 0:
        return 'Medium'
    return 'Low'


Pro_ALT_UP['Priority'] = Pro_ALT_UP['Sentiment'].apply(_priority_label)
is_high = Pro_ALT_UP['Priority'] == 'High'
high_priority = Pro_ALT_UP[is_high]
print(high_priority[['Description', 'Sentiment', 'Priority']])

In [None]:
# Mean polarity per calendar month of the 'Date Created' column
created_on = pd.to_datetime(Pro_ALT_UP['Date Created'])
Pro_ALT_UP['Month'] = created_on.dt.month
monthly_sent = Pro_ALT_UP.groupby('Month')['Sentiment'].mean()
ax = monthly_sent.plot.line(figsize=(10, 6))
ax.set_title('Average Sentiment Over Time')
ax.set_xlabel('Month')
ax.set_ylabel('Average Sentiment Score')
plt.show()

### Step 7: Keywords for Each Description

In [None]:
# KeyBERT is imported here because the file-level import is commented out; keeping it
# local means a missing keybert install only affects this cell.
from keybert import KeyBERT

kw_model = KeyBERT()


def extract_kw_keybert(description):
    """Return up to five KeyBERT keywords for one description.

    Args:
        description: Text of a single work-order description.

    Returns:
        A list with the keyword strings (scores dropped); empty for blank text.
    """
    if not description.strip():
        return []  # nothing to extract from a blank description
    # Pass this row's text; passing the whole `des` Series would reprocess the
    # entire corpus for every row and ignore the row's own description.
    keywords = kw_model.extract_keywords(description, top_n=5)
    return [kw[0] for kw in keywords]


Pro_ALT_UP['KeyBERT'] = cleaned_des.apply(extract_kw_keybert)
print(Pro_ALT_UP[['Description', 'KeyBERT']].head())