In [1]:
import pandas as pd
import numpy as np
import operator
import warnings
import re
warnings.filterwarnings('ignore')

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from statistics import mean 

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score,jaccard_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import RepeatedKFold 
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package stopwords to /Users/timur/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/timur/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Извлечение данных

In [2]:
# Apps whose review CSV files are loaded; each name is substituted into the
# path templates below.
app_list = ["facebook", "swiftkey", "tapfish", "templerun2"]
# Path templates; "{}" is filled with an app name via str.format.
labeled_df_path ="Diplom/{}_labeled.csv"
unlabeled_df_path ="Diplom/{}_unlabeled.csv"

# Column names used to index the review data frames.
TEXT = "text"
RATING = "rating"
INFORMATIVE = "informative"

# Separator printed between sections of the textual reports.
DIVIDER = "\n-----------------------------------------------------------------\n"

In [3]:
def get_labeled_df_from_app(app):
    """Read the labeled reviews CSV of one app into a DataFrame.

    `app` is substituted into the `labeled_df_path` template.
    """
    csv_path = labeled_df_path.format(app)
    labeled_reviews = pd.read_csv(csv_path)
    return labeled_reviews

def get_unlabeled_df_from_app(app):
    """Read the unlabeled reviews CSV of one app into a DataFrame.

    `app` is substituted into the `unlabeled_df_path` template.
    """
    csv_path = unlabeled_df_path.format(app)
    unlabeled_reviews = pd.read_csv(csv_path)
    return unlabeled_reviews

def get_all_apps_labeled_df():
    """Return the labeled reviews of every app in `app_list` as one DataFrame.

    The per-app frames are read first and concatenated in a single call. This
    avoids re-copying the accumulated frame on every iteration and keeps the
    dtypes read from the CSV files (seeding the concatenation with an empty
    frame can degrade them to `object`). The result has a fresh 0..n-1 index.

    An empty frame with the TEXT, RATING and INFORMATIVE columns is returned
    when `app_list` is empty.
    """
    frames = [get_labeled_df_from_app(app) for app in app_list]
    if not frames:
        # pd.concat raises ValueError on an empty sequence.
        return pd.DataFrame(columns=[TEXT, RATING, INFORMATIVE])
    return pd.concat(frames, ignore_index=True)

def get_all_apps_unlabeled_df():
    """Return the unlabeled reviews of every app in `app_list` as one DataFrame.

    The per-app frames are read first and concatenated in a single call. This
    avoids re-copying the accumulated frame on every iteration and keeps the
    dtypes read from the CSV files (seeding the concatenation with an empty
    frame can degrade them to `object`). The result has a fresh 0..n-1 index.

    An empty frame with the TEXT and RATING columns is returned when
    `app_list` is empty.
    """
    frames = [get_unlabeled_df_from_app(app) for app in app_list]
    if not frames:
        # pd.concat raises ValueError on an empty sequence.
        return pd.DataFrame(columns=[TEXT, RATING])
    return pd.concat(frames, ignore_index=True)

# Фильтрация

## Предобработка данных

In [4]:
def preprocessing(X):
    """Normalise raw review texts before vectorisation.

    Each item of `X` is converted with `str()` and then:
    non-word characters become spaces, single letters surrounded by whitespace
    are dropped, whitespace runs are collapsed, a leading "b " is stripped,
    the text is lower-cased and every token is lemmatised with WordNet.

    `X` may be any iterable of texts (list, NumPy array, pandas Series). Items
    are visited in iteration order rather than through `X[i]`, so a Series
    whose index is not 0..n-1 (for instance after rows were filtered out) is
    processed correctly instead of raising KeyError or picking rows by label.

    Returns a list of cleaned strings, one per input item, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    documents = []
    for raw_text in X:
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(raw_text))

        # Remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        # (the pattern has no letters, so no case-insensitive flag is needed)
        document = re.sub(r'\s+', ' ', document)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization, token by token
        tokens = [lemmatizer.lemmatize(word) for word in document.split()]
        documents.append(' '.join(tokens))
    return documents

## Векторизация

In [5]:
MAX_FEATURES = 1500
MIN_DF = 5
MAX_DF = 0.7

# Both vectorizers use the same vocabulary limits and the NLTK English
# stop-word list; each one receives its own copy of that list.
_english_stop_words = stopwords.words('english')

tfidf_vectorizer = TfidfVectorizer(
    max_features=MAX_FEATURES,
    min_df=MIN_DF,
    max_df=MAX_DF,
    stop_words=list(_english_stop_words),
)
count_vectorizer = CountVectorizer(
    max_features=MAX_FEATURES,
    min_df=MIN_DF,
    max_df=MAX_DF,
    stop_words=list(_english_stop_words),
)

def fit_count_vectorizer(vectorizer, raw_documents):
    """Clean `raw_documents` and fit `vectorizer` on the cleaned texts.

    The vectorizer supplied by the caller is the one that gets fitted.
    Previously the argument was ignored and the module-level
    `count_vectorizer` was always fitted. Pass `count_vectorizer` here when
    the result is later transformed with `get_tf`.

    Returns the list of cleaned documents produced by `preprocessing`.
    """
    clean_documents = preprocessing(raw_documents)
    vectorizer.fit(clean_documents)
    return clean_documents
    
def get_tf(clean_documents):
    """Transform cleaned documents with the module-level `count_vectorizer`."""
    return count_vectorizer.transform(clean_documents)

def fit_tfidf_vectorizer(raw_documents):
    """Clean `raw_documents`, fit the module-level `tfidf_vectorizer` on them
    and return the cleaned documents."""
    cleaned = preprocessing(raw_documents)
    tfidf_vectorizer.fit(cleaned)
    return cleaned
    
def get_tfidf(clean_documents):
    """Transform cleaned documents with the module-level `tfidf_vectorizer`."""
    return tfidf_vectorizer.transform(clean_documents)


In [6]:
def get_tf_feature_names():
    """Return the vocabulary terms of the fitted `count_vectorizer` as a list.

    `get_feature_names_out()` is used when the installed scikit-learn provides
    it (`get_feature_names()` was deprecated in 1.0 and removed in 1.2); older
    versions fall back to the legacy method.
    """
    if hasattr(count_vectorizer, "get_feature_names_out"):
        # The newer API returns an ndarray; callers have always received a list.
        return count_vectorizer.get_feature_names_out().tolist()
    return count_vectorizer.get_feature_names()


def get_tfidf_feature_names():
    """Return the vocabulary terms of the fitted `tfidf_vectorizer` as a list.

    `get_feature_names_out()` is used when the installed scikit-learn provides
    it (`get_feature_names()` was deprecated in 1.0 and removed in 1.2); older
    versions fall back to the legacy method.
    """
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        # The newer API returns an ndarray; callers have always received a list.
        return tfidf_vectorizer.get_feature_names_out().tolist()
    return tfidf_vectorizer.get_feature_names()


## Сравнение классификаторов

In [7]:
# Candidate classifiers, all constructed with their default hyper-parameters.
default_classifiers = [
    LogisticRegression(),
    LinearSVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier()
]

In [8]:
def print_score_by_classifier(classifier, score):
    """Print the class name of `classifier` followed by its F1 `score`."""
    header = "----- {} ---- \n".format(classifier.__class__.__name__)
    # One print call per original output line, arguments kept as-is.
    for print_args in ((header,), ('F1 score:', score), ("\n",)):
        print(*print_args)

In [9]:
def get_f1_scores_cross_validation(classifier, X, y, n_splits=10, n_repeats=5, random_state=None):
    """Evaluate `classifier` with repeated K-fold cross-validation.

    Parameters
    ----------
    classifier : estimator with `fit` and `predict`; it is refitted on every
        fold and is left fitted on the training part of the last fold.
    X : feature matrix supporting row selection with an integer index array.
    y : labels (list, array or pandas Series). They are converted to a NumPy
        array so that fold indices select rows by position, not by the index
        labels of a Series.
    n_splits, n_repeats : cross-validation layout (defaults keep the original
        10 folds x 5 repeats).
    random_state : forwarded to `RepeatedKFold`; pass an int for reproducible
        splits.

    Returns
    -------
    list of F1 scores, one per fold (n_splits * n_repeats values).
    """
    y_values = np.asarray(y)
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    scores = []
    for train_index, test_index in cv.split(X):
        classifier.fit(X[train_index], y_values[train_index])
        predictions = classifier.predict(X[test_index])
        scores.append(f1_score(y_values[test_index], predictions))
    return scores

In [10]:
def get_X_Y():
    """Build the training data from the labeled reviews of all apps.

    Fits the module-level TF-IDF vectorizer on the cleaned review texts and
    returns (TF-IDF feature matrix, INFORMATIVE labels cast to int).
    """
    labeled_df = get_all_apps_labeled_df()
    cleaned_texts = fit_tfidf_vectorizer(labeled_df[TEXT])
    features = get_tfidf(cleaned_texts)
    labels = labeled_df[INFORMATIVE].astype('int')
    return features, labels

In [11]:
def print_classifiers_comparison_report(classifiers):
    """Cross-validate every classifier, print its mean F1 score and return
    the pair (classifier with the highest mean score, that score).

    On ties the classifier that comes first in `classifiers` wins.
    """
    X, Y = get_X_Y()
    mean_scores = []
    for candidate in classifiers:
        fold_scores = get_f1_scores_cross_validation(candidate, X, Y)
        candidate_mean = mean(fold_scores)
        print_score_by_classifier(candidate, candidate_mean)
        mean_scores.append(candidate_mean)

    best_score = max(mean_scores)
    best_position = mean_scores.index(best_score)
    return classifiers[best_position], best_score

## Полученные результаты

In [12]:
best_classifier, max_score = print_classifiers_comparison_report(default_classifiers)
# Summarise the winner of the default-parameter comparison.
best_summary = "----- Best classifier: {} with F1-score {}".format(
    best_classifier.__class__.__name__, max_score)
print(best_summary)

----- LogisticRegression ---- 

F1 score: 0.8198256670403109


----- LinearSVC ---- 

F1 score: 0.8407720280211066


----- DecisionTreeClassifier ---- 

F1 score: 0.7946606711745674


----- RandomForestClassifier ---- 

F1 score: 0.8131067165228496


----- GradientBoostingClassifier ---- 

F1 score: 0.6943551913858309


----- Best classifier: LinearSVC with F1-score 0.8407720280211066


### Настроим параметры для RandomForestClassifier и GradientBoostingClassifier

In [13]:
def get_best_params_from_random_forest():
    """Grid-search RandomForestClassifier hyper-parameters on the labeled data.

    Runs a 3-fold `GridSearchCV` over n_estimators, max_features,
    min_samples_split and bootstrap using the data from `get_X_Y()`.

    Returns the tuple
    (n_estimators, max_features, min_samples_split, bootstrap)
    of the best parameter combination.
    """
    param_grid = {
        "n_estimators": [100, 200, 300],
        # "auto" is intentionally absent: for classifiers it was an alias of
        # "sqrt" (so it only duplicated work) and scikit-learn 1.3 rejects it.
        "max_features": ["sqrt", "log2"],
        "min_samples_split": [2, 4, 8],
        "bootstrap": [True, False],
    }
    # NOTE(review): no `scoring` is given, so the search ranks candidates by
    # the estimator's default score (accuracy), while the comparison report
    # uses F1 - confirm this is intended.
    grid = GridSearchCV(RandomForestClassifier(), param_grid, n_jobs=-1, cv=3)

    X, Y = get_X_Y()
    grid.fit(X, Y)
    best = grid.best_params_
    return best["n_estimators"], best["max_features"], best["min_samples_split"], best["bootstrap"]
    
def get_best_random_forest():
    """Return an unfitted RandomForestClassifier configured with the
    grid-searched hyper-parameters."""
    n_estimators, max_features, min_samples_split, bootstrap = get_best_params_from_random_forest()
    forest_options = {
        "n_estimators": n_estimators,
        "max_features": max_features,
        "min_samples_split": min_samples_split,
        "bootstrap": bootstrap,
    }
    return RandomForestClassifier(**forest_options)

In [14]:
def get_best_gradient_boosting_params():
    """Grid-search GradientBoostingClassifier hyper-parameters (3-fold CV) on
    the data from `get_X_Y()`.

    Returns the tuple (learning_rate, max_depth, min_samples_leaf) of the
    best parameter combination.
    """
    search_space = {
        'learning_rate': [0.1, 0.05, 0.02, 0.01],
        'max_depth': [4, 6, 8],
        'min_samples_leaf': [20, 50, 100, 150],
    }
    search = GridSearchCV(GradientBoostingClassifier(), search_space, cv=3, n_jobs=-1)

    features, labels = get_X_Y()
    search.fit(features, labels)

    winner = search.best_params_
    return tuple(winner[key] for key in ("learning_rate", "max_depth", "min_samples_leaf"))
    

def get_best_gradient_boosting_classifier():
    """Return an unfitted GradientBoostingClassifier configured with the
    grid-searched hyper-parameters."""
    learning_rate, max_depth, min_samples_leaf = get_best_gradient_boosting_params()
    boosting_options = {
        "learning_rate": learning_rate,
        "max_depth": max_depth,
        "min_samples_leaf": min_samples_leaf,
    }
    return GradientBoostingClassifier(**boosting_options)


In [15]:
# Same line-up as `default_classifiers`, except that the random forest and the
# gradient boosting models are built from grid-searched hyper-parameters.
# Evaluating this list literal runs both grid searches.
tuned_classifiers = [
    LogisticRegression(),
    LinearSVC(),
    DecisionTreeClassifier(),
    get_best_random_forest(),
    get_best_gradient_boosting_classifier()
]

## Новые результаты

In [16]:
best_classifier, max_score = print_classifiers_comparison_report(tuned_classifiers)
# Summarise the winner of the tuned-parameter comparison.
best_summary = "----- Best classifier: {} with F1-score {}".format(
    best_classifier.__class__.__name__, max_score)
print(best_summary)

----- LogisticRegression ---- 

F1 score: 0.8188646890812574


----- LinearSVC ---- 

F1 score: 0.8419225660256778


----- DecisionTreeClassifier ---- 

F1 score: 0.7942819573645123


----- RandomForestClassifier ---- 

F1 score: 0.8439587844105666


----- GradientBoostingClassifier ---- 

F1 score: 0.7628969075602122


----- Best classifier: RandomForestClassifier with F1-score 0.8439587844105666


# Тематическая модель

### Получение датасета из неразмеченных данных с использованием классификатора

In [17]:
def predict_for_df(unlabeled_df, classifier):
    """Predict labels for the TEXT column of `unlabeled_df`.

    The texts are cleaned, transformed with the already fitted TF-IDF
    vectorizer and passed to `classifier.predict`; its result is returned.
    """
    cleaned_texts = preprocessing(unlabeled_df[TEXT])
    return classifier.predict(get_tfidf(cleaned_texts))
    
def filter_df(df):
    """Keep only the rows whose INFORMATIVE value equals 1 and renumber the
    index from 0."""
    informative_mask = df[INFORMATIVE] == 1
    return df[informative_mask].reset_index(drop=True)
    
def get_filtered_df_from_unlabeled_df(app_index, classifier, row_limit = -1):
    """Return the reviews of one app that `classifier` predicts as informative.

    Parameters
    ----------
    app_index : position of the app in `app_list`.
    classifier : fitted estimator used by `predict_for_df`.
    row_limit : maximum number of rows to return; -1 (default) returns all
        rows in file order, any other value shuffles the rows and keeps the
        first `row_limit`.

    The unlabeled CSV of the app is loaded (the previous implementation read
    the labeled file by mistake), predictions are stored in the INFORMATIVE
    column, and rows with missing values or duplicated TEXT are dropped.
    The returned frame always has a fresh 0..n-1 index, so positional
    consumers work for both values of `row_limit`.
    """
    app = app_list[app_index]
    unlabeled_df = get_unlabeled_df_from_app(app)

    predicted_df = unlabeled_df.copy()
    predicted_df[INFORMATIVE] = predict_for_df(unlabeled_df, classifier)

    filtered_df = filter_df(predicted_df).dropna().drop_duplicates(subset=TEXT)
    if row_limit != -1:
        # Shuffle before truncating so the sample is not biased by file order.
        filtered_df = filtered_df.sample(frac=1).head(row_limit)
    return filtered_df.reset_index(drop=True)


In [18]:
# Classifier-filtered reviews of the first app in `app_list`, limited to
# 10000 shuffled rows.
# NOTE(review): cell In [20] below rebinds `filtered_df`, so this result is
# not what the topic model receives.
filtered_df = get_filtered_df_from_unlabeled_df(
    app_index = 0, 
    classifier = best_classifier, 
    row_limit = 10000)

In [19]:
def get_filtered_df_from_labeled_df(app_index, row_limit = -1):
    """Return the reviews of one app that are labeled informative.

    Parameters
    ----------
    app_index : position of the app in `app_list`.
    row_limit : maximum number of rows to return; -1 (default) returns all
        rows in file order, any other value shuffles the rows and keeps the
        first `row_limit`.

    Rows with missing values or duplicated TEXT are dropped. The returned
    frame always has a fresh 0..n-1 index; previously the index kept gaps
    when `row_limit` was -1, which breaks code that addresses rows by
    position.
    """
    labeled_df = get_labeled_df_from_app(app_list[app_index])

    filtered_df = filter_df(labeled_df).dropna().drop_duplicates(subset=TEXT)
    if row_limit != -1:
        # Shuffle before truncating so the sample is not biased by file order.
        filtered_df = filtered_df.sample(frac=1).head(row_limit)
    return filtered_df.reset_index(drop=True)

In [20]:
# Informative reviews of the first app in `app_list`, taken from its labeled
# data and limited to 10000 shuffled rows. This rebinds `filtered_df`.
filtered_df = get_filtered_df_from_labeled_df(
    app_index = 0, 
    row_limit = 10000)

### Подготовка входных данных для тематического моделирования

In [21]:
# Inputs for topic modelling, all derived from `filtered_df`:
# original texts (list), cleaned texts (list) and ratings cast to int (Series).
raw_reviews = filtered_df[TEXT].tolist()
clean_reviews = preprocessing(filtered_df[TEXT])
reviews_rating = filtered_df[RATING].astype('int')

## Выделение тем

#### 1) Создание LDA модели, из которой мы получаем матрицы весов и признаков
#### 2) Создание классов Тем и Отзывов из этих матриц
#### 3) Ранжирование топиков
#### 4) Ранжирование отзывов
###### 4.1) Удаление нерелевантных к теме отзывов
Нерелевантными считаются отзывы, вес (доля) которых для данной темы ниже порогового значения.
###### 4.2) Удаление дублирующихся по смыслу отзывов
Дубликатами считаются отзывы, у которых коэффициент сходства Жаккара с другим отзывом той же темы не ниже порогового значения.
#### 5) Показ результата
###### 5.1) Показ номера темы и её ключевых слов
###### 5.2) Показ топ-отзывов по этой теме

### LDA модель

In [22]:
# Vocabulary size cap intended for the topic-model vectorizer.
FEATURES_NUMBER = 1500
# Number of topics (n_components) fitted by the LDA model.
TOPIC_NUMBER = 20

In [23]:
def get_lda_outputs(clean_reviews):
    """Fit an LDA topic model on the cleaned reviews.

    The reviews are turned into term counts with a `CountVectorizer` limited
    to `FEATURES_NUMBER` terms, then a `LatentDirichletAllocation` model with
    `TOPIC_NUMBER` components is fitted (online learning, fixed random_state).

    Returns
    -------
    lda_H : the fitted model's `components_` (one row per topic, one column
        per vocabulary term).
    lda_W : the model's transform of the count matrix (one row per review,
        one column per topic).
    feature_names : list of vocabulary terms, indexed like the columns of
        `lda_H`.
    """
    vectorizer = CountVectorizer(
        max_df=0.95, min_df=2, max_features=FEATURES_NUMBER,
        stop_words=stopwords.words('english'))
    tf = vectorizer.fit_transform(clean_reviews)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
    # API and convert its ndarray to the list callers have always received.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()

    lda_model = LatentDirichletAllocation(
        n_components=TOPIC_NUMBER, max_iter=5,
        learning_method='online', learning_offset=50., random_state=0
    ).fit(tf)

    lda_W = lda_model.transform(tf)
    lda_H = lda_model.components_

    return lda_H, lda_W, feature_names

In [24]:
# Fit LDA on the cleaned reviews: topic-term matrix, review-topic matrix and
# the vocabulary that labels the topic-term columns.
lda_H, lda_W, feature_names = get_lda_outputs(clean_reviews)

### Классы Тем и Отзывов

In [25]:
def get_jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the token sets of two texts.

    Both texts are cleaned with `preprocessing` and split on whitespace; the
    result is |A & B| / |A | B| as a float in [0, 1].

    When neither text yields a token after cleaning (for example two reviews
    made only of punctuation) the union is empty. 0.0 is returned in that
    case instead of raising ZeroDivisionError, so such reviews are never
    treated as duplicates of each other.
    """
    tokens_a = set(preprocessing([str1])[0].split())
    tokens_b = set(preprocessing([str2])[0].split())
    union_size = len(tokens_a | tokens_b)
    if union_size == 0:
        return 0.0
    return float(len(tokens_a & tokens_b)) / union_size

In [26]:
class Review:
    """One review scored relative to a single topic.

    Holds the review text, its proportion for the topic, its rating, the
    number of duplicates merged into it and the three feature weights that
    `calculate_review_score` combines into `review_score`.
    """

    # Class-level defaults; `duplicate_number` and `review_score` stay at
    # these values until something assigns them on the instance.
    review_id = 0
    text_review = ""
    proportion = 0
    rating = 0
    duplicate_number = 0
    review_score = 0
    proportion_feature_weight = 0
    rating_feature_weight = 0
    duplicate_feature_weight = 0

    def __init__(self, review_id, text_review, proportion, rating,
                 proportion_feature_weight, rating_feature_weight, duplicate_feature_weight):
        """Store the review data and the weights of the three score features."""
        self.review_id, self.text_review = review_id, text_review
        self.proportion, self.rating = proportion, rating
        self.proportion_feature_weight = proportion_feature_weight
        self.rating_feature_weight = rating_feature_weight
        self.duplicate_feature_weight = duplicate_feature_weight

    def calculate_review_score(self):
        """Set `review_score` to the weighted sum of the duplicate count, the
        topic proportion and the inverse rating."""
        duplicate_term = self.duplicate_feature_weight * self.duplicate_number
        proportion_term = self.proportion_feature_weight * self.proportion
        rating_term = self.rating_feature_weight * float(1) / self.rating
        self.review_score = duplicate_term + proportion_term + rating_term
        

In [27]:
class Topic:
    """A topic of the topic model together with its reviews and scores.

    Attributes
    ----------
    topic_id : identifier of the topic.
    reviews : list of review objects (each with review_id, text_review,
        proportion, rating and duplicate_number).
    key_words : list of keywords, best first.
    volume : sum of the review proportions (see `calculate_volume`).
    rating : volume divided by the proportion-weighted rating sum.
    group_score : weighted sum of volume and rating.
    volume_feature_weight, rating_feature_weight : weights of `group_score`.
    """

    def __init__(self, topic_id, volume_feature_weight, rating_feature_weight):
        self.topic_id = topic_id
        self.volume_feature_weight = volume_feature_weight
        self.rating_feature_weight = rating_feature_weight
        # Containers are created per instance; as class attributes they would
        # be shared (and mutated) by every Topic.
        self.reviews = []
        self.key_words = []
        self.volume = 0
        self.rating = 0
        self.group_score = 0
        self.top_reviews_indices = 0

    @staticmethod
    def _head(items, limit):
        """Return a copy of the first `limit` items; a negative limit means all."""
        if limit is None or limit < 0:
            return items[:]
        # Slicing already copes with limit > len(items); no clamping needed.
        return items[:limit]

    def get_top_reviews(self, top_review_number = -1):
        """Return the first `top_review_number` reviews (-1: all of them).

        Asking for at least as many reviews as exist returns every review;
        earlier the last one was silently dropped.
        """
        return self._head(self.reviews, top_review_number)

    def get_top_keywords(self, top_keywords_number = -1):
        """Return the first `top_keywords_number` keywords (-1: all of them).

        Asking for at least as many keywords as exist returns every keyword;
        earlier the last one was silently dropped.
        """
        return self._head(self.key_words, top_keywords_number)

    def eliminate_unrelated_reviews(self, alpha = 0.1):
        """Remove, in place, the reviews whose proportion is below `alpha`."""
        self.reviews[:] = [review for review in self.reviews
                           if not review.proportion < alpha]

    def calculate_volume(self):
        """Set `volume` to the sum of the proportions of the current reviews.

        The value is recomputed from scratch, so repeated calls do not
        accumulate.
        """
        self.volume = sum(review.proportion for review in self.reviews)

    def calculate_rating(self):
        """Set `rating` to volume / sum(rating * proportion).

        Uses the current `volume`, so `calculate_volume` must run first. The
        rating is 0.0 when the weighted sum is zero (e.g. no reviews) instead
        of failing with a division by zero.
        """
        weighted_sum = sum(review.rating * review.proportion for review in self.reviews)
        self.rating = float(self.volume) / weighted_sum if weighted_sum else 0.0

    def calculate_group_score(self):
        """Set `group_score` to the weighted sum of `volume` and `rating`."""
        self.group_score = (self.volume_feature_weight * self.volume
                            + self.rating_feature_weight * self.rating)

    def remove_duplicates(self, beta):
        """Merge reviews whose Jaccard similarity is at least `beta`.

        Every pair of remaining reviews is compared (including the last
        review, which the previous inner loop bound skipped). When a later
        review duplicates an earlier one, the earlier review keeps the lower
        rating and the higher proportion of the two, its `duplicate_number`
        grows by one, and the later review is dropped.
        """
        reviews = self.reviews
        removed_ids = set()
        for position, kept in enumerate(reviews):
            if kept.review_id in removed_ids:
                continue
            for candidate in reviews[position + 1:]:
                if candidate.review_id in removed_ids:
                    continue
                # NOTE: the similarity helper re-cleans both texts on each call.
                if get_jaccard_similarity(kept.text_review, candidate.text_review) >= beta:
                    kept.rating = min(kept.rating, candidate.rating)
                    kept.proportion = max(kept.proportion, candidate.proportion)
                    kept.duplicate_number += 1
                    removed_ids.add(candidate.review_id)

        self.reviews = [review for review in reviews
                        if review.review_id not in removed_ids]

    def print_review_ids(self):
        """Print the list of review ids of this topic."""
        print(self.get_review_ids())

    def get_review_ids(self):
        """Return the ids of the current reviews, in list order."""
        return [review.review_id for review in self.reviews]
    

### Создание объектов Тем и Отзывов

In [28]:
def get_topic_list():
    """Build one `Topic` per row of `lda_H`, each with a `Review` per review.

    Reads the module-level `lda_H`, `lda_W`, `feature_names`, `raw_reviews`
    and `reviews_rating`. Keywords of a topic are the vocabulary terms ordered
    by descending weight in its `lda_H` row; the proportion of a review is its
    `lda_W` entry for that topic.

    Ratings are read by position from a NumPy view of `reviews_rating`, so
    they stay aligned with `raw_reviews` and `lda_W` even when
    `reviews_rating` is a pandas Series whose index is not 0..n-1.
    """
    ratings = np.asarray(reviews_rating)
    topic_list = []
    for topic_idx, word_weights in enumerate(lda_H):
        topic = Topic(topic_idx, volume_feature_weight = 0.7, rating_feature_weight = 0.3)
        topic.key_words = [feature_names[i] for i in word_weights.argsort()[::-1]]
        review_proportions = lda_W[:, topic_idx]

        topic.reviews = [
            Review(review_id, raw_reviews[review_id], review_proportions[review_id], ratings[review_id],
                   proportion_feature_weight = 0.4, rating_feature_weight = 0.2, duplicate_feature_weight = 0.4)
            for review_id in range(len(ratings))
        ]
        topic_list.append(topic)
    return topic_list

### Ранжирование Тем и Отзывов

In [29]:
def topics_ranking(topic_list):
    """Score every topic and sort `topic_list` in place, best score first.

    For each topic the volume, the rating and the group score are computed
    in that order.
    """
    scoring_steps = ("calculate_volume", "calculate_rating", "calculate_group_score")
    for topic in topic_list:
        for step_name in scoring_steps:
            getattr(topic, step_name)()

    topic_list.sort(key=operator.attrgetter("group_score"), reverse=True)

In [30]:
def reviews_ranking(topic_list, min_proportion_for_topic = 0.1, duplicate_similarity_score = 0.5):
    """Filter, deduplicate and rank the reviews of every topic, printing the
    review counts after each step.

    For each topic: reviews below `min_proportion_for_topic` are removed,
    reviews with similarity of at least `duplicate_similarity_score` are
    merged, every remaining review is scored and the reviews are sorted in
    place by descending score.
    """
    print(DIVIDER)
    for topic in topic_list:
        print('Topic {}: \n'.format(topic.topic_id))

        count_initial = len(topic.reviews)
        print('Number of reviews: ', count_initial)

        topic.eliminate_unrelated_reviews(min_proportion_for_topic)
        count_related = len(topic.reviews)
        print('Unrelated eliminated: ', count_initial - count_related)
        print('Number of reviews: ', count_related)

        topic.remove_duplicates(duplicate_similarity_score)
        count_unique = len(topic.reviews)
        print('Duplicate eliminated: ', count_related - count_unique)
        print('Number of reviews: ', count_unique)

        print(DIVIDER)

        for review in topic.reviews:
            review.calculate_review_score()

        topic.reviews.sort(key=operator.attrgetter('review_score'), reverse=True)


In [31]:
def print_topics_with_reviews(topic_list, top_keywords_number = None, top_reviews_number = None):
    """Print every topic with its top keywords and top reviews.

    Parameters
    ----------
    topic_list : topics to print, in the order given.
    top_keywords_number : keywords shown per topic; defaults to the
        module-level TOP_KEYWORDS_NUMBER.
    top_reviews_number : reviews shown per topic; defaults to the
        module-level TOP_REVIEWS_NUMBER.

    Each review is printed on its own line. Trailing line breaks of the
    review text are stripped so the output is not double-spaced; this is what
    the `end=''` argument, previously passed to `str.format` by mistake
    (where it was ignored), was meant to achieve.
    """
    if top_keywords_number is None:
        top_keywords_number = TOP_KEYWORDS_NUMBER
    if top_reviews_number is None:
        top_reviews_number = TOP_REVIEWS_NUMBER

    for topic in topic_list:
        keywords = " ".join(topic.get_top_keywords(top_keywords_number = top_keywords_number)).upper()
        print("Topic {}:\n\nKeywords: {}\n".format(topic.topic_id, keywords))

        print("Top reviews from this topic:\n")
        for idx, review in enumerate(topic.get_top_reviews(top_review_number = top_reviews_number)):
            print("{}) {}".format(idx, review.text_review).rstrip("\r\n"))

        print(DIVIDER)

### Результаты тематического моделирования

In [32]:
# Minimum topic proportion a review needs to stay in a topic.
MIN_PROPORTION_FOR_TOPIC = 0.1
# Jaccard similarity at or above which two reviews count as duplicates.
DUPLICATE_SIMILARITY_SCORE = 0.5

In [33]:
# Number of keywords and of reviews shown per topic by print_topics_with_reviews.
TOP_KEYWORDS_NUMBER = 5
TOP_REVIEWS_NUMBER = 20

In [34]:
# Build the topics, rank them by group score, then filter, deduplicate and
# rank the reviews inside every topic (counts are printed along the way).
topic_list = get_topic_list()

print("\n Topic and review ranking process: \n")
topics_ranking(topic_list)
reviews_ranking(
                topic_list, 
                min_proportion_for_topic = MIN_PROPORTION_FOR_TOPIC, 
                duplicate_similarity_score = DUPLICATE_SIMILARITY_SCORE)



 Topic and review ranking process: 


-----------------------------------------------------------------

Topic 3: 

Number of reviews:  1661
Unrelated eliminated:  1055
Number of reviews:  606
Duplicate eliminated:  33
Number of reviews:  573

-----------------------------------------------------------------

Topic 9: 

Number of reviews:  1661
Unrelated eliminated:  1110
Number of reviews:  551
Duplicate eliminated:  35
Number of reviews:  516

-----------------------------------------------------------------

Topic 14: 

Number of reviews:  1661
Unrelated eliminated:  1212
Number of reviews:  449
Duplicate eliminated:  22
Number of reviews:  427

-----------------------------------------------------------------

Topic 2: 

Number of reviews:  1661
Unrelated eliminated:  1286
Number of reviews:  375
Duplicate eliminated:  18
Number of reviews:  357

-----------------------------------------------------------------

Topic 12: 

Number of reviews:  1661
Unrelated eliminated:  1548
Numb

In [35]:
# Final report: keywords and top-ranked reviews of every topic.
print("\n Topics with top reviews: \n")
print(DIVIDER)
print_topics_with_reviews(topic_list)


 Topics with top reviews: 


-----------------------------------------------------------------

Topic 3:

Keywords: UPDATE TIME FORCE STATUS APP

Top reviews from this topic:

0) constantly force closes.

1) i can't update my status.. -_-"

2) slower, force closes all the time, doesn't load pictures..

3) Cant update status.

4) every time i try to open the app my phone freezes.

5) since last update i cant see any pictures...

6) ever since the latest update i have not been able to view photos on the app.

7) not to mention it force closes constantly.

8) keeps crashing, forcing it to close.

9) it also won't let me upload my pictures...

10) Still won't let me log in

11) It freezes all the time and kicks me out all the time.

12) since the latest update it always says "network error"

13) at first i wasn't going to let something like being unable to upload photos on a consistent basis bother me, but then all of the force closing, network error, and lag made me want to fli