**Task: [T347492](https://phabricator.wikimedia.org/T347492)**

1. [Data Gathering](#Data-Gathering)
2. [Resuts](#Results)

## Data-Gathering

In [2]:
import pandas as pd
import wmfdata as wmf

pd.options.display.max_columns = None
from IPython.display import clear_output

import warnings
from collections import Counter




You are using Wmfdata v2.0.0, but v2.0.1 is available.

To update, run `pip install --upgrade git+https://github.com/wikimedia/wmfdata-python.git@release`.

To see the changes, refer to https://github.com/wikimedia/wmfdata-python/blob/release/CHANGELOG.md.


### spark_session

In [3]:
spark_session = wmf.spark.get_active_session()

if type(spark_session) != type(None):
    spark_session.stop()
else:
    print('no active session')

no active session


In [4]:
spark_session = wmf.spark.create_custom_session(
    master="yarn",
    app_name='idwiki-mobile-reverts',
    spark_config={
        "spark.driver.memory": "4g",
        "spark.dynamicAllocation.maxExecutors": 64,
        "spark.executor.memory": "16g",
        "spark.executor.cores": 4,
        "spark.sql.shuffle.partitions": 256,
        "spark.driver.maxResultSize": "2g"
        
    }
)

clear_output()

spark_session

In [5]:
spark_session.sparkContext.setLogLevel("ERROR")

### query 

In [6]:
warnings.filterwarnings('ignore')

reverts_query = """
WITH
    reverts AS (
        SELECT
            MONTH(event_timestamp) AS month,
            revision_id,
            event_user_is_anonymous,
            CASE
                WHEN event_user_revision_count <= 250 THEN TRUE
                ELSE FALSE
            END AS is_newcomer,
            page_namespace_is_content,
            wiki_db
        FROM
            wmf.mediawiki_history
        WHERE 
            snapshot = '2023-09'
            AND wiki_db = 'idwiki'
            AND SIZE(event_user_is_bot_by) = 0
            AND YEAR(event_timestamp) = 2023
            AND revision_is_identity_revert
            AND MONTH(event_timestamp) IN (8, 9)
        )
        
SELECT
    rv.*,
    useragent.device_family,
    event.action
FROM
    reverts rv
    LEFT JOIN event.editattemptstep eas 
    ON rv.wiki_db = eas.wiki 
        AND rv.revision_id = event.revision_id
"""

reverts = (wmf
           .spark
           .run(reverts_query)
           .dropna()
           .drop_duplicates('revision_id', ignore_index=True))

clear_output()

In [7]:
# due to a bug https://phabricator.wikimedia.org/T249944; as Oct 2023, EditAttemptStep logs an editor as desktop user if they force desktop mode on mobile
# however, we can get close to accurate results by capturing various device families related to mobiles
# the initial list was from https://github.com/wikimedia-research/No-js-edit-analysis-2021/blob/main/wikitext_edits_nojs_analysis.ipynb

patterns = ['mobi','240x240','240x320','320x320','alcatel','android','audiovox','bada','benq','blackberry',
            'cdm-', 'compal-','docomo','ericsson','hiptop','htc[-_]','huawei','ipod','kddi-','kindle', 'meego',
            'midp','mitsu', 'mmp\\/','mot-','motor','ngm_','nintendo','opera.m','palm','panasonic','philips','phone',
            'playstation', 'portalmmm','sagem-','samsung','sanyo','sec-','sendo','sharp','silk','softbank',
            'symbian','teleca', 'up.browser','webos', 'oppo', 'infinix', 'xiaomi', 'redmi', 'vivo']

patterns_start = ['lg-', 'sie-', 'nec-', 'lge-', 'sgh-', 'pg-', 'sm-', 'rmx', 'cph']

In [8]:
# covert to lower case
reverts['device_family'] = reverts['device_family'].str.lower()

# join patterns with OR operator
pattern_str = '|'.join(patterns)
pattern_start_str = '|'.join(patterns_start)

# set default as desktop
reverts['device_type'] = 'desktop'

# change to mobile where there is a match
mask = reverts['device_family'].str.contains(pattern_str, regex=True) | reverts['device_family'].str.startswith(tuple(pattern_start_str))
reverts.loc[mask, 'device_type'] = 'mobile'

## Results

In [19]:
print('Proportion of reverts made by device (newcomers: less than 250 edits)')
print('\nabsolute')
print(reverts.query("""is_newcomer == True""").device_type.value_counts())
print('\nas percentage')
print(reverts.query("""is_newcomer == True""").device_type.value_counts(normalize=True) * 100)

Proportion of reverts made by device (newcomers: less than 250 edits)

absolute
desktop    654
mobile     243
Name: device_type, dtype: int64

as percentage
desktop    72.909699
mobile     27.090301
Name: device_type, dtype: float64


In [20]:
print('Proportion of reverts made by device (newcomers: less than 250 edits)')
print('\nabsolute')
print(reverts.query("""is_newcomer == False""").device_type.value_counts())
print('\nas percentage')
print(reverts.query("""is_newcomer == False""").device_type.value_counts(normalize=True) * 100)

Proportion of reverts made by device (newcomers: less than 250 edits)

absolute
desktop    5308
mobile     1915
Name: device_type, dtype: int64

as percentage
desktop    73.487471
mobile     26.512529
Name: device_type, dtype: float64


## Misc

In [None]:
# used to update list of device family
# a few updates include 'oppo', 'infinix', 'xiaomi', 'redmi', 'vivo' & 'sm-', 'rmx', 'cph'

def count_word_frequencies(text_list):
    combined_text = " ".join(filter(None, text_list))

    words = combined_text.split()

    word_freq = Counter(words)

    return dict(word_freq)

count_word_frequencies(reverts.dropna().drop_duplicates('revision_id').device_family.values)